From 01183f8ffcdf3943a32b273bfc0d2dd11d694dfa Mon Sep 17 00:00:00 2001 From: Henri Menke Date: Thu, 20 Apr 2017 10:13:17 +1200 Subject: [PATCH] [core] Remove SD --- doc/sphinx/features.rst | 13 - doc/sphinx/introduction.rst | 2 - doc/sphinx/setup.rst | 24 - doc/ug/features.tex | 9 - doc/ug/introduction.tex | 1 - doc/ug/run.tex | 72 - doc/ug/setup.tex | 26 - src/core/global.cpp | 18 +- src/core/global.hpp | 20 +- src/core/integrate.cpp | 9 - src/core/integrate_sd.cpp | 648 --------- src/core/integrate_sd.hpp | 123 -- src/core/integrate_sd_cuda.cu | 1743 ----------------------- src/core/integrate_sd_cuda.hpp | 251 ---- src/core/integrate_sd_cuda_debug.cu | 337 ----- src/core/integrate_sd_cuda_debug.hpp | 143 -- src/core/integrate_sd_cuda_device.cu | 130 -- src/core/integrate_sd_cuda_device.hpp | 43 - src/core/integrate_sd_cuda_kernel.cu | 1884 ------------------------- src/core/integrate_sd_cuda_kernel.hpp | 191 --- src/core/integrate_sd_matrix.cu | 283 ---- src/core/integrate_sd_matrix.hpp | 72 - src/core/thermostat.hpp | 2 - 23 files changed, 11 insertions(+), 6033 deletions(-) delete mode 100644 src/core/integrate_sd.cpp delete mode 100644 src/core/integrate_sd.hpp delete mode 100644 src/core/integrate_sd_cuda.cu delete mode 100644 src/core/integrate_sd_cuda.hpp delete mode 100644 src/core/integrate_sd_cuda_debug.cu delete mode 100644 src/core/integrate_sd_cuda_debug.hpp delete mode 100644 src/core/integrate_sd_cuda_device.cu delete mode 100644 src/core/integrate_sd_cuda_device.hpp delete mode 100644 src/core/integrate_sd_cuda_kernel.cu delete mode 100644 src/core/integrate_sd_cuda_kernel.hpp delete mode 100644 src/core/integrate_sd_matrix.cu delete mode 100644 src/core/integrate_sd_matrix.hpp diff --git a/doc/sphinx/features.rst b/doc/sphinx/features.rst index 504acdebcf3..bd5e4afb696 100644 --- a/doc/sphinx/features.rst +++ b/doc/sphinx/features.rst @@ -138,16 +138,6 @@ integrator or thermostat: - Enables the implicit calculation of electro-hydrodynamics for charged particles and salt ions in an electric field. -- enable Stokesian Dynamics. - -- disable periodic boundary conditions in Stokesian Dynamics. - -- use float instead of double in Stokesian Dynamics. - -- disable nearfield of Stokesian Dynamics. - -- enable debug Stokesian Dynamics. - Interactions ------------ @@ -279,6 +269,3 @@ looking directly at the code. - Causes to try to provoke a core dump when exiting unexpectedly. - Causes to try this even with MPI errors. - -- Causes to check more things in the Stokesian Dynamics code. If is - larger 1, SD prints more informations. diff --git a/doc/sphinx/introduction.rst b/doc/sphinx/introduction.rst index b8db0ab80a6..5df4a43398b 100644 --- a/doc/sphinx/introduction.rst +++ b/doc/sphinx/introduction.rst @@ -183,8 +183,6 @@ report so to the developers. +--------------------------------+------------------------+------------------+ | Tunable Slip Boundary | Single | Single | +--------------------------------+------------------------+------------------+ -| Stokesian Dynamics | Single | Single | -+--------------------------------+------------------------+------------------+ | **Analysis** | +--------------------------------+------------------------+------------------+ | uwerr | None | Good | diff --git a/doc/sphinx/setup.rst b/doc/sphinx/setup.rst index 70f2b7779b9..883578e0420 100644 --- a/doc/sphinx/setup.rst +++ b/doc/sphinx/setup.rst @@ -132,19 +132,6 @@ re-used. (int) if non-zero (default), some warnings are printed out. Set this to zero if you get annoyed by them. -(double) viscosity of the fluid for the Stokesian Dynamics simulation. - -(double) hydrodynamic radius of the particles used in Stokesian -Dynamics. - -(int[2]) seed of the Stokes Dynamics random number generator. - -(int[2]) offset of the random number generator. Together with the seed, -the state of the random number generator is well defined. - -(double) precision used for the approximation of the square root of the -mobility. Sometimes higher accuracy can speedup the simulation. - Setting global variables in Python ---------------------------------- @@ -508,17 +495,6 @@ unit operating at a constant temperature. Be aware that this thermostat requires to be given in Kelvin. -Stokesian Dynamics thermostat -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. todo:: - This is not implemented yet for the python interface - -This thermostat should be used together with the Stokesian Dynamics -implementation. No other thermostat is able to thermalize SD correctly. -The precision of the farfield contribution of the thermostat can be -tuned with - .. _nemd: ``nemd``: Setting up non-equilibrium MD diff --git a/doc/ug/features.tex b/doc/ug/features.tex index 0447f3b08a8..d3a2130a944 100755 --- a/doc/ug/features.tex +++ b/doc/ug/features.tex @@ -146,13 +146,6 @@ \section{General features} \item \newfeature{LB_ELECTROHYDRODYNAMICS} Enables the implicit calculation of electro-hydrodynamics for charged particles and salt ions in an electric field. -\item \newfeature{SD} enable Stokesian Dynamics. -\item \newfeature{SD_NOT_PERIODIC} disable periodic boundary conditions in - Stokesian Dynamics. -\item \newfeature{SD_USE_FLOAT} use float instead of double in Stokesian - Dynamics. -\item \newfeature{SD_FF_ONLY} disable nearfield of Stokesian Dynamics. -\item \newfeature{SD_DEBUG} enable debug Stokesian Dynamics. \end{itemize} @@ -264,8 +257,6 @@ \section{Debug messages} \item \newfeature{FORCE\_CORE} Causes \es{} to try to provoke a core dump when exiting unexpectedly. \item \newfeature{MPI\_CORE} Causes \es{} to try this even with MPI errors. -\item \newfeature{SD_DEBUG} Causes \es{} to check more things in the Stokesian - Dynamics code. If \var{warnings} is larger 1, SD prints more information. \end{itemize} %%% Local Variables: diff --git a/doc/ug/introduction.tex b/doc/ug/introduction.tex index fdef43da751..9b0cefd88c1 100755 --- a/doc/ug/introduction.tex +++ b/doc/ug/introduction.tex @@ -166,7 +166,6 @@ \section{Available simulation methods} DPD & None & Good \\ Shan-Chen Multicomponent Fluid & Group & Group \\ Tunable Slip Boundary & Single & Single \\ - Stokesian Dynamics & Single & Single \\ \multicolumn{3}{|c|}{\textbf{Analysis}} \\ uwerr & None & Good \\ \multicolumn{3}{|c|}{\textbf{Input/Output}} \\ diff --git a/doc/ug/run.tex b/doc/ug/run.tex index 4048abd9de5..0a52327f051 100755 --- a/doc/ug/run.tex +++ b/doc/ug/run.tex @@ -588,78 +588,6 @@ \subsubsection{Details on implementation} codes) but so-called Lucy functions. See \cite{marsili09} for more details. These avoid the calculation of exponentials. -\section{\texttt{integrate_sd}: Running a stokesian dynamics simulation} -\newescommand[integrate-sd]{integrate_sd} - -\begin{essyntax} - \variant{1} integrate_sd \var{steps} -\end{essyntax} - -\es uses in the stokesian dynamics algorithm the euler integrater for the -equations of motion. The motion are overdamped. The velocities of the -particles are related to the displacement in the last timestep. This are, at -least if the system is thermalized, however not usefull, as the displacements -don't scale linear with the timestep. The command \texttt{integrate_sd} with -an integer \var{steps} as parameter integrates the system for -\var{steps} time steps. This is implemented using CUDA, so \required{CUDA} -has to be available in the system. - -Currently there is no parallel implementation of this integrator, so all -particles have to be in a single process. - -\subsection{Setting up the system} -Before running a stokesian dynamics simulation, you have to set a view -constants: -\begin{globvar} -\item[sd_radius] The hydrodynamic particle radius of the (sperical) - particles. Only one particle size is supported. -\item[sd_viscosity] The viscisity of the fluid. Remember this is only a - scaling of the timestep, so you can set it without problems to one. -\item[sd_seed] (int[2]) seed of the Stokes Dynamics random number generator. - As the generator used for the SD runes on the GPU (cuRAND) it has its own - seed and own state. -\item[sd_random_state] (int[2]) offset of the random number - generator. Together with the seed, the state of the random number generator - is well defined. -\item[sd_precision_random] (double) precision used for the approximation of - the square root of the mobility. Sometimes higher accuracy can speedup the - simulation. -\end{globvar} - -Make sure to only use the stokesian dynamics thermostat. If you made a warmup -integration without SD, set it with: -\begin{tclcode} -thermostat off -thermostat sd \$temp -\end{tclcode} - -\subsection{Periodicity} -The Code uses the ewald-summation (as derived in \cite{beenakker86a}) of the -Rotne-Prager tensor to be usefull in periodic boundary conditions. To disable -the peridoicity of the hydrodynamic, use the feature -\feature{SD_NOT_PERIODIC}. - -The ewald summation is required if the system is periodic, as otherwise an -cutoff is introduced, which can lead to negative eigenvalues of the -mobility. This leads to a break down of the code, as it is not possible to -thermalize such a system. - -\subsection{Floatingpoint precision} -It is possible to switch between double and float as datatype in Stokesian -Dynamics. As GPUs have more single than double precision floating point -units, the single precision is faster. However this can lead to problems, if -the eigenvalues are not precise enough and get negative. Therfore the default -is double precision. If you want to use single precision, use the feature -\feature{SD_USE_FLOAT}. - -\subsection{Farfield only} -The mobility matrix consists of two contribution, the farfield and the -nearfield lubrication correction. -If the nearfield is neglegible, than the feature \feature{SD_FF_ONLY} can be -used. -This should speedup the simulation, as the nearfield doesn't need to be -inverted. Additional the thermal displacements can be directly calculated. - \section{Multi-timestepping} \newescommand[multitimestepping]{multitimestepping} diff --git a/doc/ug/setup.tex b/doc/ug/setup.tex index 7c6918f3835..99882f3e3eb 100755 --- a/doc/ug/setup.tex +++ b/doc/ug/setup.tex @@ -151,17 +151,6 @@ \section{\texttt{setmd}: Setting global variables in TCL} verlet list has been re-used. \item[warnings] (int) if non-zero (default), some warnings are printed out. Set this to zero if you get annoyed by them. -\item[sd_viscosity] (double) viscosity of the fluid for the Stokesian Dynamics - simulation. -\item[sd_radius] (double) hydrodynamic radius of the particles used in - Stokesian Dynamics. -\item[sd_seed] (int[2]) seed of the Stokes Dynamics random number generator. -\item[sd_random_state] (int[2]) offset of the random number - generator. Together with the seed, the state of the random number generator - is well defined. -\item[sd_precision_random] (double) precision used for the approximation of - the square root of the mobility. Sometimes higher accuracy can speedup the - simulation. \end{globvar} @@ -602,21 +591,6 @@ \subsection{CPU thermostat} Be aware that this thermostat requires \var{temperature} to be given in Kelvin. -\subsection{Stokesian Dynamics thermostat} -\label{ssec:SDthermostat} -\begin{essyntax} - thermostat sd \var{temperature} - \begin{features} - \required{CUDA} and \required{SD} - \end{features} -\end{essyntax} - -This thermostat should be used together with the Stokesian Dynamics -implementation. -No other thermostat is able to thermalize SD correctly. -The precision of the farfield contribution of the thermostat can be tuned -with - \section{\texttt{nemd}: Setting up non-equilibrium MD} \newescommand{nemd} \label{sec:NEMD} diff --git a/src/core/global.cpp b/src/core/global.cpp index 6bb972f3abe..2b22b44f8c7 100755 --- a/src/core/global.cpp +++ b/src/core/global.cpp @@ -34,7 +34,6 @@ #include "imd.hpp" #include "ghmc.hpp" #include "lb.hpp" -#include "integrate_sd.hpp" /** This array contains the description of all global variables. @@ -99,19 +98,14 @@ const Datafield fields[] = { {&lb_components, TYPE_INT, 1, "lb_components", 2, false }, /* 51 from ghmc.cpp */ {&warnings, TYPE_INT, 1, "warnings", 1, false }, /* 52 from global.cpp */ {&dpd_ignore_fixed_particles, TYPE_INT, 1, "dpd_ignore_fixed_particles", 1, false }, /* 53 from global.cpp */ - {&sd_viscosity, TYPE_DOUBLE, 1, "sd_viscosoity", 4, false }, /* 54 from integrate_sd.cpp */ - {&sd_radius, TYPE_DOUBLE, 1, "sd_radius", 4, false }, /* 55 from integrate_sd.cpp */ - {&sd_seed, TYPE_INT, 2, "sd_seed", 4, false }, /* 56 from integrate_sd.cpp */ - {&sd_random_state, TYPE_INT, 2, "sd_random_state", 4, false }, /* 57 from integrate_sd.cpp */ - {&sd_random_precision, TYPE_DOUBLE, 1, "sd_precision_random", 4, false }, /* 58 from integrate_sd.cpp */ - {&smaller_time_step,TYPE_DOUBLE,1, "smaller_time_step", 5, false }, /* 59 from integrate.cpp */ - {configtemp, TYPE_DOUBLE, 2, "configtemp", 1, false }, /* 60 from integrate.cpp */ - {&langevin_trans, TYPE_BOOL, 1, "langevin_trans_switch", 1, false }, /* 61 from thermostat.cpp */ - {&langevin_rotate, TYPE_BOOL, 1, "langevin_rotate_switch", 1, false }, /* 62 from thermostat.cpp */ + {&smaller_time_step,TYPE_DOUBLE,1, "smaller_time_step", 5, false }, /* 54 from integrate.cpp */ + {configtemp, TYPE_DOUBLE, 2, "configtemp", 1, false }, /* 55 from integrate.cpp */ + {&langevin_trans, TYPE_BOOL, 1, "langevin_trans_switch", 1, false }, /* 56 from thermostat.cpp */ + {&langevin_rotate, TYPE_BOOL, 1, "langevin_rotate_switch", 1, false }, /* 57 from thermostat.cpp */ #ifndef ROTATIONAL_INERTIA - {&langevin_gamma_rotation, TYPE_DOUBLE, 1, "gamma_rot",1, false }, /* 63 from thermostat.cpp */ + {&langevin_gamma_rotation, TYPE_DOUBLE, 1, "gamma_rot",1, false }, /* 58 from thermostat.cpp */ #else - {langevin_gamma_rotation, TYPE_DOUBLE, 3, "gamma_rot",1, false }, /* 63 from thermostat.cpp */ + {langevin_gamma_rotation, TYPE_DOUBLE, 3, "gamma_rot",1, false }, /* 58 from thermostat.cpp */ #endif { NULL, 0, 0, NULL, 0, false } }; diff --git a/src/core/global.hpp b/src/core/global.hpp index 54e7004be9e..5c3dbcd61ac 100755 --- a/src/core/global.hpp +++ b/src/core/global.hpp @@ -185,26 +185,16 @@ extern const Datafield fields[]; #define FIELD_WARNINGS 52 /** DPD_IGNORE_FIXED_PARTICLES */ #define FIELD_DPD_IGNORE_FIXED_PARTICLES 53 -/** index of \ref sd_viscosity in \ref #fields */ -#define FIELD_SD_VISCOSITY 54 -/** index of \ref sd_radius in \ref_#fields */ -#define FIELD_SD_RADIUS 55 -/** index of \ref sd_seed in \ref #fields */ -#define FIELD_SD_SEED 56 -/** index of \ref sd_random_state i_ \ref #fields */ -#define FIELD_SD_RANDOM_STATE 57 -/** index of \ref sd_precision_random in \ref #fields */ -#define FIELD_SD_RANDOM_PRECISION 58 /** index of \ref smaller_timestep in \ref #fields */ -#define FIELD_SMALLERTIMESTEP 59 +#define FIELD_SMALLERTIMESTEP 54 /** index of \ref configtemp in \ref #fields */ -#define FIELD_CONFIGTEMP 60 +#define FIELD_CONFIGTEMP 55 /** index of \ref langevin_trans in \ref #fields */ -#define FIELD_LANGEVIN_TRANS_SWITCH 61 +#define FIELD_LANGEVIN_TRANS_SWITCH 56 /** index of \ref langevin_rotate in \ref #fields */ -#define FIELD_LANGEVIN_ROT_SWITCH 62 +#define FIELD_LANGEVIN_ROT_SWITCH 57 /** index of \ref langevin_gamma_rotation in \ref #fields */ -#define FIELD_LANGEVIN_GAMMA_ROTATION 63 +#define FIELD_LANGEVIN_GAMMA_ROTATION 58 /*@}*/ diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp index e10871348d8..eff1ef9e32e 100755 --- a/src/core/integrate.cpp +++ b/src/core/integrate.cpp @@ -361,15 +361,6 @@ void integrate_vv(int n_steps, int reuse_forces) { } #endif -#ifdef SD - if (thermo_switch & THERMO_SD) { - runtimeWarning("Use integrate_sd to use Stokesian Dynamics Thermalizer."); - } - if (thermo_switch & THERMO_BD) { - runtimeWarning("Use integrate_sd to use Brownian Dynamics Thermalizer."); - } -#endif - /* Integration Steps: Step 1 and 2 of Velocity Verlet scheme: v(t+0.5*dt) = v(t) + 0.5*dt * f(t) p(t + dt) = p(t) + dt * v(t+0.5*dt) diff --git a/src/core/integrate_sd.cpp b/src/core/integrate_sd.cpp deleted file mode 100644 index 7b61f94eaeb..00000000000 --- a/src/core/integrate_sd.cpp +++ /dev/null @@ -1,648 +0,0 @@ -/* - Copyright (C) 2010,2011,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -0*/ - -/** \file integrate_sd.cpp Stokesian dynamics integrator. - * - * For more information about the integrator - * see \ref integrate_sd.hpp "integrate_sd.hpp". -*/ - -#include -#include -#include -#include -#include -#include -#include "utils.hpp" -#include "integrate_sd.hpp" -#include "integrate.hpp" -#include "reaction.hpp" -#include "electrokinetics.hpp" -#include "interaction_data.hpp" -#include "particle_data.hpp" -#include "communication.hpp" -#include "grid.hpp" -#include "cells.hpp" -#include "verlet.hpp" -#include "rotation.hpp" -#include "ghosts.hpp" -#include "pressure.hpp" -#include "p3m.hpp" -#include "maggs.hpp" -#include "thermostat.hpp" -#include "initialize.hpp" -#include "forces.hpp" -#include "nsquare.hpp" -#include "domain_decomposition.hpp" -#include "layered.hpp" -#include "nemd.hpp" -#include "rattle.hpp" -#include "errorhandling.hpp" -#include "lattice.hpp" -#include "lb.hpp" -#include "virtual_sites.hpp" -#include "correlators/Correlator.hpp" -#include "ghmc.hpp" - -using std::ostringstream; - -/************************************************ - * DEFINES - ************************************************/ - -/** Tag for communication in verlet fix: propagate_positions() */ -#define REQ_INT_VERLET 400 - -/******************* variables *******************/ - -/// switch between double and float -#ifdef SD_USE_FLOAT -typedef float real; -#else -typedef double real; -#endif - -/******************* variables *******************/ - -double sd_viscosity=1; -double sd_radius=-1; - -int sd_seed[]={0,('E'+'S'+'P'+'R'+'e'+'s'+'S'+'o')}; -int sd_random_state[]={0,0}; -double sd_random_precision=1e-3; - -/** \name Privat Functions */ -/************************************************************/ -/*@{*/ - -/** Propagate the positions. Integration step:
- \f[ p(t+\Delta t) = p(t) + \Delta t \mu f(t) \f] */ -void propagate_pos_sd(); - -#ifdef CUDA -void propagate_pos_sd_cuda(real * box_l, int N,real * pos_h, real * force_h, real * velo_h); -#endif - -void propagate_pos_bd(int N, real * pos, real * force, real * velocity); - -int sd_get_particle_num(); -/*@}*/ - -void integrator_sanity_checks_sd() -{ - if ( time_step < 0.0 ) { - ostringstream msg; - msg <<"time_step not set"; - runtimeError(msg); - } - if ( temperature < 0.0 ) { - ostringstream msg; - msg <<"thermostat not initialized"; - runtimeError(msg); - } - if (sd_radius < 0) { - ostringstream msg; - msg <<"Stokesian Dynamics Hydrodynamic particle radius not initialized"; - runtimeError(msg); - } - if (sd_viscosity < 0) { - ostringstream msg; - msg <<"Stokesian Dynamics fluid viscosity not initialized"; - runtimeError(msg); - } -} - -/************************************************************/ - - -/************************************************************/ - -#if defined(SD) || defined(BD) - -void integrate_sd(int n_steps) -{ - /* Prepare the Integrator */ - on_integration_start(); - - /* if any method vetoes (P3M not initialized), immediately bail out */ - if (check_runtime_errors()) - return; - - INTEG_TRACE(fprintf(stderr,"%d: integrate_vv: integrating %d steps (recalc_forces=%d)\n", - this_node, n_steps, recalc_forces)); - - /* Integration Step: - Calculate forces f(t) as function of positions p(t) ( and velocities v(t) ) */ - //if (recalc_forces) { - //thermo_heat_up(); - - - ghost_communicator(&cell_structure.collect_ghost_force_comm); - -#ifdef ROTATION - convert_initial_torques(); -#endif - - //thermo_cool_down(); - - /* Communication Step: ghost forces */ - - - /*apply trap forces to trapped molecules*/ -#ifdef MOLFORCES - // prob. works only with harmonic bounds - calc_and_apply_mol_constraints(); -#endif - - /* should be pretty late, since it needs to zero out the total force */ -#ifdef COMFIXED - calc_comfixed(); -#endif - - //rescale_forces(); - -#ifdef COLLISION_DETECTION - //should not be neccessery, as integrator avoids collision - handle_collisions(); -#endif - // end of force calculation - -#ifdef GHMC - if(thermo_switch & THERMO_GHMC) - ghmc_init(); -#endif - - if (check_runtime_errors()) - return; - - n_verlet_updates = 0; - - /* Integration loop */ - for(int step=0;step0) verlet_reuse = n_steps/(double) n_verlet_updates; - else verlet_reuse = 0; - -#ifdef NPT - if(integ_switch == INTEG_METHOD_NPT_ISO) { - nptiso.invalidate_p_vel = 0; - MPI_Bcast(&nptiso.p_inst, 1, MPI_DOUBLE, 0, comm_cart); - MPI_Bcast(&nptiso.p_diff, 1, MPI_DOUBLE, 0, comm_cart); - MPI_Bcast(&nptiso.volume, 1, MPI_DOUBLE, 0, comm_cart); - if(this_node==0) nptiso.p_inst_av /= 1.0*n_steps; - MPI_Bcast(&nptiso.p_inst_av, 1, MPI_DOUBLE, 0, comm_cart); - } -#endif - -#ifdef GHMC - if(thermo_switch & THERMO_GHMC) - ghmc_close(); -#endif - -} - -//#endif /* SD */ - -/* Privat functions */ -/************************************************************/ - -//#ifdef SD - -void propagate_pos_sd() -{ - /* Verlet list criterion */ - double skin2 = SQR(0.5 * skin); - - INTEG_TRACE(fprintf(stderr,"%d: propagate_pos:\n",this_node)); - Cell *cell; - Particle *p; - int c, i, np; - //get total number of particles - int N=sd_get_particle_num(); - // gather all the data for mobility calculation - real * pos=NULL; - pos=(real *)Utils::malloc(DIM*N*sizeof(double)); - assert(pos!=NULL); - real * force=NULL; - force=(real *)Utils::malloc(DIM*N*sizeof(double)); - assert(force!=NULL); - real * velocity=NULL; - velocity=(real *)Utils::malloc(DIM*N*sizeof(real)); - assert(velocity!=NULL); -#ifdef EXTERNAL_FORCES - const int COORD_ALL=COORD_FIXED(0)&COORD_FIXED(1)&COORD_FIXED(2); -#endif - int j=0; // total particle counter - for (c = 0; c < local_cells.n; c++){ - cell = local_cells.cell[c]; - p = cell->part; - np = cell->n; - for (i = 0; i < np; i++) { // only count nonVirtual Particles -#ifdef EXTERNAL_FORCES - if (p[i].p.ext_flag & COORD_ALL) - { - fprintf (stderr, "Warning: Fixing particle in StokesDynamics this way with EXTERNAL_FORCES is not possible (and will be ignored). Please try to bind them e.g. harmonicaly.\n"); - } -#endif -#ifdef VIRTUAL_SITES - if (!ifParticleIsVirtual(&p[i])) -#endif - { -#ifdef SD_USE_FLOAT - for (int d=0;d<3;d++){ - pos[3*j+d] = p[i].r.p[d]; - pos[3*j+d] -=rint(pos[3*j+d]/box_l[d])*box_l[d]; - force[3*j+d] = p[i].f.f[d]; - } -#else - memmove(&pos[3*j], p[i].r.p, 3*sizeof(double)); - memmove(&force[3*j], p[i].f.f, 3*sizeof(double)); - for (int d=0;d<3;d++){ - pos[3*j+d] -=rint(pos[3*j+d]/box_l[d])*box_l[d]; - } -#endif - j++; - } - } - } - if (!(thermo_switch & THERMO_SD) && thermo_switch & THERMO_BD){ - propagate_pos_bd(N,pos,force, velocity); - } else { - // cuda part -#ifdef CUDA - //void propagate_pos_sd_cuda(double * box_l_h, int N,double * pos_h, double * force_h, double * velo_h){ -#ifdef SD_USE_FLOAT - real box_size[3]; - for (int d=0;d<3;d++){ - box_size[d]=box_l[d]; - } -#else - real * box_size = box_l; -#endif - if (!(thermo_switch & THERMO_SD)){ - temperature*=-1; - } - propagate_pos_sd_cuda(box_size,N,pos,force, velocity); - if (!(thermo_switch & THERMO_SD)){ - temperature*=-1; - } -#else - fprintf(stderr, "Warning - CUDA is currently required for SD\n"); - fprintf(stderr, "So i am just sitting here and copying stupidly stuff :'(\n"); -#endif -} - - -#ifdef NEMD - /* change momentum of each particle in top and bottom slab */ - fprintf (stderr, "Warning: NEMD is in SD not supported.\n"); -#endif - j=0; - for (c = 0; c < local_cells.n; c++) { - cell = local_cells.cell[c]; - p = cell->part; - np = cell->n; - for (i = 0; i < np; i++) { -#ifdef VIRTUAL_SITES - if (ifParticleIsVirtual(&p[i])) continue; -#endif - // write back of position and velocity data -#ifdef SD_USE_FLOAT - for (int d=0;d<3;d++){ - p[i].r.p[d] = pos[3*j+d]+box_l[d]*rint(p[i].r.p[d]/box_l[d]); - p[i].m.v[d] = velocity[3*j+d]; - //p[i].f.f[d] *= (0.5*time_step*time_step)/(*part).p.mass; - } -#else - for (int d=0;d<3;d++){ - p[i].r.p[d] = pos[3*j+d]+box_l[d]*rint(p[i].r.p[d]/box_l[d]); - } - memmove(p[i].m.v, &velocity[DIM*j], 3*sizeof(double)); -#endif - // somehow this does not effect anything, although it is called ... - for (int d=0;d<3;d++){ - p[i].f.f[d] *= (0.5*time_step*time_step)/(*p).p.mass; - } - for (int d=0;d skin2 ) resort_particles = 1; - } - } - free(pos); - free(force); - free(velocity); - announce_resort_particles(); -} - - -int sd_set_particles_apart(){ - { - double V=box_l[0]; - V*=box_l[1]; - V*=box_l[2]; - int N=sd_get_particle_num(); - double Vpart=4.*M_PI/3.*sd_radius*SQR(sd_radius)*N; - double phi = Vpart/V; - //fprintf(stderr,"info: sd_set_particles_apart: Volume fraction of particles is %f.\n",phi); - if (phi > M_PI/3./sqrt(2)){ - return -5; - } - if (phi > 0.7){ - return -4; - } - if (phi > 0.5){ - fprintf(stderr,"warning: sd_set_particles_apart: Volume fraction of particles is %f.\n\ -It could be difficult to set particles apart!\n",phi); - } - } - bool outer=false; - bool moved=false; - do { - outer=false; - for (int c = 0; c < local_cells.n; c++){ - Cell * cell = local_cells.cell[c]; - Particle * p = cell->part; - int np = cell->n; - bool inner=false; - do { - inner=false; - for (int i = 0; i < np; i++) { // only count nonVirtual Particles - for (int j = i+1; j SQR(2*sd_radius*(1+1e-5))); - assert(dr2 < SQR(2*sd_radius*(1+2e-5))); - inner=true; - moved=true; - } - } - } - } while (inner); - for (int cj =0; cj < local_cells.n;cj++){ - Cell * cellJ = local_cells.cell[cj]; - Particle * pj = cellJ->part; - int npj = cellJ->n; - for (int i = 0; i < np; i++) { - for (int j = (c==cj?i+1:0); j 0); - for (int d=0; d<3;d++){ - p[i].r.p[d] +=fac*dr[d]; - pj[j].r.p[d]-=fac*dr[d]; - } - outer=true; - moved=true; - } - for (int d=0;d<3;d++){ - assert(!isnan( pj[j].r.p[d])); - assert(!isnan( p[i].r.p[d])); - } - } - } - } - } - if (moved) fprintf(stderr,"+"); - } while (outer); - //fprintf(stderr,"set_apart suceeded "); - for (int c = 0; c < local_cells.n; c++){ - Cell * cell = local_cells.cell[c]; - Particle * p = cell->part; - int np = cell->n; - for (int cj =0; cj < local_cells.n;cj++){ - Cell * cellJ = local_cells.cell[cj]; - Particle * pj = cellJ->part; - int npj = cellJ->n; - for (int i = 0; i < np; i++) { - for (int j = (c==cj?i+1:0); j SQR(2*sd_radius*(1+1e-5)))); - } - } - } - } - return 0; -} - -int sd_get_particle_num(){ - int N=0; - for (int c = 0; c < local_cells.n; c++){ -#ifdef VIRTUAL_SITES - Cell * cell = local_cells.cell[c]; - Particle * p = cell->part; - int np = cell->n; - for (int i = 0; i < np; i++) { // only count nonVirtual Particles - if (!ifParticleIsVirtual(&p[i])) ++N; - } -#else - N += local_cells.cell[c]->n; -#endif - } - return N; -} - -void propagate_pos_bd(int N, real * pos, real * force, real * velocity){ - real self_mobility=1/(6*M_PI*sd_viscosity*sd_radius); - real scal_f = time_step*self_mobility; - real scal_r=sqrt(2*temperature*time_step*self_mobility); - for (int i=0;i<3*N;i++){ - pos[i]+=scal_f * force[i] - + scal_r * gaussian_random(); - } -} - -#endif /* SD */ diff --git a/src/core/integrate_sd.hpp b/src/core/integrate_sd.hpp deleted file mode 100644 index fc3ae150d39..00000000000 --- a/src/core/integrate_sd.hpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ -#ifndef __INTEGRATE_SD_H -#define __INTEGRATE_SD_H - -/** Average number of integration steps the verlet list has been re - used. */ -extern double verlet_reuse; - -extern double sd_viscosity; - -extern double sd_radius; - -extern int sd_seed[2]; -extern int sd_random_state[2]; - -extern double sd_random_precision; - -#ifdef SD - -#ifdef CUDA -# define HAVE_CUBLAS -# include -//#include -# ifdef __CUDA_ARCH__ -# if __CUDA_ARCH__ < 200 -# error "Stokesian Dynamics needs at least CUDA ComputeCapability 2.0" -# endif //#if __CUDA_ARCH__ < 200 -# endif //#ifdef __CUDA_ARCH__ -#else -# ifdef SD -# error "CUDA is not given!" -# error "StokesDynamics requires CUDA" -# endif -#endif - -#endif - -#define DIM (3) - -/** \file integrate_sd.hpp Stokes dynamics integrator. - * - * For more information see \ref integrate_sd.cpp "integrate_sd.cpp". -*/ - -/************************************************************/ -/** \name Exported Variables */ -/************************************************************/ -/*@{*/ - -#ifdef SD_USE_FLOAT -typedef float real; -#else -typedef double real; -#endif - -/** Switch determining which Integrator to use. */ -extern int integ_switch; - -/** incremented if a Verlet update is done, aka particle resorting. */ -extern int n_verlet_updates; - -/** Time step for the integration. */ -extern double time_step; -extern double time_step_half; -extern double time_step_squared; -extern double time_step_squared_half; - -/** Old time step needed for rescaling of forces. */ -extern double old_time_step; -/** Actual simulation time (only on MASTER NODE). */ -extern double sim_time; -/** Maximal interaction cutoff. */ -extern double max_cut; -/** Verlet list skin. */ -extern double skin; - -/** If non-zero, the particle data will be resorted before the next integration. */ -extern int resort_particles; -/** If non-zero, the forces will be recalculated before the next integration. */ -extern int recalc_forces; - -/*@}*/ - -/** \name Exported Functions */ -/************************************************************/ -/*@{*/ - -/** check sanity of integrator params */ -void integrator_sanity_checks_sd(); - -/** integrate with euler integrator. - \param n_steps number of steps to integrate. - */ -void integrate_sd(int n_steps); - -/** set particle apart so they dont overlap. - */ -int sd_set_particles_apart(); - -int sd_test(int size, int type); - -/*@}*/ -//#endif /* SD */ -#endif diff --git a/src/core/integrate_sd_cuda.cu b/src/core/integrate_sd_cuda.cu deleted file mode 100644 index e214420ea3a..00000000000 --- a/src/core/integrate_sd_cuda.cu +++ /dev/null @@ -1,1743 +0,0 @@ - -/* - Copyright (C) 2010,2011,2012,2016 The ESPResSo project - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ -#include "config.hpp" -#ifdef SD /* Terminates at end of file */ -//#define SD_PC - -/** \file integrate_sd_cuda.cu Stokes dynamics integrator. - * - * Here the calculation of the displacements is implemented. -*/ - - -#include -#include -#include "cuda_runtime.h" -#include "curand.h" -#include -// for std::swap: -// C++98: -#include -// C++11: -//#include - -#include "assert.h" -#include "integrate_sd_cuda.hpp" -#include "integrate_sd_cuda_debug.hpp" -#include "integrate_sd_cuda_kernel.hpp" -#include "integrate_sd.hpp" -#include -//#include "errorhandling.hpp" -#include "global.hpp" - -#if defined(OMPI_MPI_H) || defined(_MPI_H) -#error CU-file includes mpi.h! This should not happen! -#endif - - -// global variables for usage in this file -cublasHandle_t cublas = NULL; -curandGenerator_t generator = NULL; -int reprint=-1; -/* *************************************************************************************************************** * - * ******************************************** implementation ******************************************** * - * *************************************************************************************************************** */ -/* *************************************************************************************************************** * - * ******* III MM MM PPP L EEEE MM MM EEEE NN N TTTTTTT AAA TTTTTTT III OOO NN N ******* * - * ******* I M M M M P P L E M M M M E N N N T A A T I O O N N N ******* * - * ******* I M M M PPP L EEE M M M EEE N N N T AAAAA T I O O N N N ******* * - * ******* I M M P L E M M E N N N T A A T I O O N N N ******* * - * ******* III M M P LLLL EEEE M M EEEE N NN T A A T III OOO N NN ******* * - * *************************************************************************************************************** */ -/* *************************************************************************************************************** */ - -/* *************************************************************************************************************** * - * ******************************************** HOST-Functions ******************************************** * - * *************************************************************************************************************** */ - -// this calls all the functions to: -// * generate the mobility matrix (farfield and nearfield) -// * compute the displacements -// * add the displacements to the positions -// PARAMTERS: -// box_l_h : the size of the box in x,y and z-direction, on the host (in) -// N : Number of particles (in) -// pos_h : position of the particles, simple* array on host (in and out) -// force_h : forces on the particles, simple* array on host (in) -// velo_h : velocities of the particles, simple* array on host (in and out) -// * : a simple array is e.g. [x_1, y_1, z_1, x_2, y_2, z_2, ...] -void propagate_pos_sd_cuda(real * box_l_h, int N,real * pos_h, real * force_h, real * velo_h){ - //printVectorHost(pos_h,3*N,"pos after call"); - real viscosity=sd_viscosity; - real radius =sd_radius; - if (viscosity < 0){ - std::cerr << "The viscosity for SD was not set\n"; - errexit(); - } - if (radius < 0){ - std::cerr << "The particle radius for SD was not set\n"; - errexit(); - } - if (time_step < 0){ - std::cerr << "The timestep was not set\n"; - errexit(); - } - - int lda=((3*N+31)/32)*32; - - //static cublasHandle_t cublas=NULL; - if (cublas==NULL){ - cublasCall(cublasCreate(&cublas)); - //magma_init(); - } - - real * box_l_d=NULL; - cuda_safe_mem(cudaMalloc((void**)&box_l_d, 3*sizeof(real))); - cuda_safe_mem(cudaMemcpy(box_l_d,box_l_h,3*sizeof(real),cudaMemcpyHostToDevice)); - real * pos_d=NULL; - cuda_safe_mem(cudaMalloc((void**)&pos_d, (DIM)*(((N+31)/32)*32)*sizeof(real))); - cuda_safe_mem(cudaMemcpy(pos_d,pos_h,N*DIM*sizeof(real),cudaMemcpyHostToDevice)); - //printVectorDev(pos_d,3*N,"pos after copy"); - real * force_d=NULL; - cuda_safe_mem(cudaMalloc((void**)&force_d, DIM*(((N+31)/32)*32)*sizeof(real))); - cuda_safe_mem(cudaMemcpy(force_d,force_h,N*DIM*sizeof(real),cudaMemcpyHostToDevice)); - real * mobility_d=NULL; - cuda_safe_mem(cudaMalloc((void**)&mobility_d, lda*N*3*sizeof(real))); - real * disp_d=NULL; - cuda_safe_mem(cudaMalloc((void**)&disp_d, DIM*(((N+31)/32)*32)*sizeof(real))); - int myInfo_h[]={0,0,0}; - int * myInfo_d=NULL; - cuda_safe_mem(cudaMalloc((void**)&myInfo_d, 3*sizeof(int))); - cuda_safe_mem(cudaMemcpy(myInfo_d,myInfo_h,3*sizeof(int),cudaMemcpyHostToDevice)); - - sd_compute_displacement( pos_d, N, viscosity, radius, box_l_d, box_l_h, mobility_d, force_d, disp_d, myInfo_d); - - int numBlocks = (N+numThreadsPerBlock-1)/numThreadsPerBlock; - - // copy before rescaling to get velocities - cuda_safe_mem(cudaMemcpy(velo_h,disp_d,N*DIM*sizeof(real),cudaMemcpyDeviceToHost)); - // rescale displacements - real alpha=time_step; - cublasCall(cublasRscal( cublas, 3*N, &alpha, disp_d, 1)); -#ifdef SD_MAX_STEP - sd_real_integrate_prepare<<< numBlocks , numThreadsPerBlock >>>(pos_d , disp_d, box_l_d, sd_radius, N); -#endif - sd_real_integrate<<< numBlocks , numThreadsPerBlock >>>(pos_d , disp_d, box_l_d, sd_radius, N); - - // copy back the positions - cuda_safe_mem(cudaMemcpy(pos_h,pos_d,N*DIM*sizeof(real),cudaMemcpyDeviceToHost)); - // save the displacements as velocities (maybe somebody is interested) - //alpha=1/time_step; - //cublasCall(cublasRscal(cublas, DIM*N, &alpha, disp_d, 1)); - - - cuda_safe_mem(cudaFree((void*)box_l_d)); - cuda_safe_mem(cudaFree((void*)pos_d)); - cuda_safe_mem(cudaFree((void*)force_d)); - cuda_safe_mem(cudaFree((void*)mobility_d)); - cuda_safe_mem(cudaFree((void*)disp_d)); - cuda_safe_mem(cudaFree((void*)myInfo_d)); -} - - - -// calculate the farfield and the nearfield and add them -// PARAMETERS: -// cublas : a valid handle of cublas (in) -// r_d : position of the particles on the device, size 3*N (in) -// the form has to be [x_1, y_1, z_1, x_2, y_2, z_2, ...] -// N : Number of particles (in) -// viscosity : viscositiy of the fluid (in) -// radius : Particle radius (in) -// L_d : boxsize in x y and z-directions (in) -// total_mobility_d: matrix of the computed total mobility, size 3*3*N*N (in/out, is overwritten) -void sd_compute_displacement(const real * r_d, int N, real viscosity, real radius,const real * L_d,const real * L_h, - const real * total_mobility_d, const real * force_d, real * disp_d, int * myInfo_d) -{ -#ifndef SD_FF_ONLY - real det_prec=1e-3; -#endif - real ran_prec=sd_random_precision; - static real ran_prec_last=1e-3; - if ( ran_prec_last != ran_prec){ - ran_prec_last=ran_prec; - printf("\nSetting the precision for the random part to %e\n",ran_prec); - } - cudaThreadSynchronize(); // just for debugging - cudaCheckError("START"); - const int lda=((3*N+31)/32)*32; - const int numBlocks = (N+numThreadsPerBlock-1)/numThreadsPerBlock; - // sort particles in buckets -#ifndef SD_FF_ONLY - bool _use_buckets=false; - for (int d=0;d<3;d++){ - if (L_h[d] > 4*4*radius){ // check if we can make at least 4 boxes in any direction - //_use_buckets=true; - //#warning "buckets disabled - they are still buggy" - } - } - const bool use_buckets=_use_buckets; - // variables for the bucket version - real * bucket_size=NULL; - int * bucket_num=NULL; - int * particle_count=NULL; - int * particle_sorted_list=NULL; - int * particle_to_bucket_list=NULL; - int total_bucket_num=1; - int max_particles_per_bucket=N; - //const int bucket_size_factor=3; // is global - if ( use_buckets ){ - int bucket_num_h[3]; - real bucket_size_h[3]; - real bucket_volume=1; - real bucket_volume_with_border=1; - for (int d=0;d<3;d++){ - bucket_num_h[d]=L_h[d]/4/radius*bucket_size_factor; - //bucket_num_h[d]=bucket_num_h[d]>(2*bucket_size_factor+1)?bucket_num_h[d]:1; // buckets are propably only usefull if there are more than 3 ... - bucket_num_h[d]=bucket_num_h[d]>N?N:bucket_num_h[d]; // never more than N**3 buckets ... - total_bucket_num *= bucket_num_h[d]; - bucket_size_h[d]=L_h[d]/bucket_num_h[d]; - bucket_volume*=bucket_size_h[d]; - bucket_volume_with_border*=bucket_size_h[d]+2*radius; - } - max_particles_per_bucket=bucket_volume_with_border/(4*M_PI*radius*radius*radius/3)*0.74048; - max_particles_per_bucket=max_particles_per_bucket > N || max_particles_per_bucket < 0?N:max_particles_per_bucket; // there cannot be more than all particles in one bucket - cuda_safe_mem(cudaMalloc((void **) &bucket_size, sizeof(real)*3)); - cuda_safe_mem(cudaMemcpy(bucket_size,bucket_size_h, sizeof(real)*3, cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMalloc((void **) &bucket_num, sizeof(int) *3)); - cuda_safe_mem(cudaMemcpy(bucket_num,bucket_num_h, sizeof(int) *3, cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMalloc((void **) &particle_count, sizeof(int) *total_bucket_num)); - sd_set_value<<<32,32>>>(particle_count,total_bucket_num,0); - cuda_safe_mem(cudaMalloc((void **) &particle_to_bucket_list, sizeof(int) *N)); - cuda_safe_mem(cudaMalloc((void **) &particle_sorted_list, sizeof(int) *total_bucket_num*max_particles_per_bucket)); - sd_bucket_sort<<>>(r_d, bucket_size, bucket_num, N, particle_count, particle_sorted_list, - max_particles_per_bucket, total_bucket_num, particle_to_bucket_list); - } -#endif - - // compute the mobility Matrix - matrix mobility; - mobility.size = N*3; - mobility.ldd = ((N*3+31)/32)*32; - mobility.ldd_short = ((N+31)/32)*32; - real * &mobility_d=mobility.data; - cuda_safe_mem(cudaMalloc( (void**)&mobility_d, lda*DIM*N*sizeof(real) )); - assert(mobility_d); - sd_set_zero_matrix<<>>(mobility_d,mobility.size, mobility.ldd); - cudaThreadSynchronize(); // just for debugging - cudaCheckError(""); -#ifdef SD_NOT_PERIODIC - sd_compute_mobility_matrix<<< numBlocks , numThreadsPerBlock >>>(r_d,N,1./(6.*M_PI*viscosity*radius), radius, mobility_d); - cudaThreadSynchronize(); // just for debugging - cudaCheckError("compute mobility error"); -#else - real ew_prec=1e-8; - real L_min=L_h[0]; - L_min=min(L_min,L_h[1]); - L_min=min(L_min,L_h[2]); - real L_max=L_h[0]; - L_max=max(L_max,L_h[1]); - L_max=max(L_max,L_h[2]); - assert(ew_prec < 1); - real xi=2./L_min*sqrt(log(1./ew_prec)); - real kmax2=-4*xi*xi*log(ew_prec); - kmax2*=SQR(L_max/2./M_PI); - { - static int printed=0; - if ((printed%reprint == 1)){ - fprintf(stderr,"\nL_min is %f\n",L_min); - fprintf(stderr,"xi is %f\n",xi); - fprintf(stderr,"kmax is %e\n",sqrt(kmax2)); - fprintf(stderr,"selfmobility is %e\n",1./(6.*M_PI*viscosity*radius)); - //printed=1; - } - printed++; - } - //fprintf(stderr,"kmax is %e\n",sqrt(kmax2)); - sd_compute_mobility_matrix_real_short<<>>(r_d, N, 1./(6.*M_PI*viscosity*radius), radius, L_d, - mobility.data, L_min/2., xi, xi*radius, xi*radius*xi*radius*xi*radius); - //std::cerr << "kmax is " << ceil(sqrt(kmax2)) << std::endl; - sd_compute_mobility_matrix_wave_cpu(ceil(sqrt(kmax2))+1, (ceil(sqrt(kmax2))+1)*(ceil(sqrt(kmax2))+1), mobility, - radius, L_h, xi, 1./(6.*M_PI*viscosity*radius), ew_prec); - - //cudaCheckError(""); - //mobility.hash_data(); - if (mobility.wavespace){ - cudaThreadSynchronize(); - cudaCheckError("sd_compute_mobility_matrix_real_short"); - sd_compute_mobility_sines<<>>(r_d, N, mobility.wavespace->vecs, mobility.wavespace->num, - mobility.wavespace->sines, mobility.wavespace->cosines, - mobility.ldd_short); - cudaThreadSynchronize(); - cudaCheckError("sd_compute_mobility_sines"); - - //cuda_safe_mem(cudaMalloc( (void**)&mobility.dense, mobility.ldd*mobility.size*sizeof(real) )); - //cublasCall(cublasRcopy(cublas, mobility.size*mobility.ldd,mobility.data, 1, mobility.dense,1)); - if (false) { - sd_wavepart_addto_matrix<<>>(mobility.wavespace->num, mobility.wavespace->matrices, - mobility.wavespace->sines, mobility.wavespace->cosines, - mobility.ldd_short, N, - mobility.data, mobility.ldd); - cudaThreadSynchronize(); - cudaCheckError("add wavepart to matrix"); - mobility._free_wavespace(); - } -#ifdef SD_DEBUG - /* ********************************* - * ** Test to check wavespace ** - * ********************************* * - real * testvec=NULL; - cuda_safe_mem(cudaMalloc( (void**)&testvec, mobility.ldd*3*sizeof(real) )); - if (true){ - if (generator == NULL){ - curandCall(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - } - curandCall(curandGenerateNormalReal( generator, testvec, ((mobility.size+1)/2)*2, 0, sqrt(2.))); - } else { - sd_set_zero<<>>(testvec, mobility.size); - //real val=1e-20; - //for (int i = 0 ; i < mobility.size;i++){ - //if (i > 30) - //sd_set_value<<<1,1>>>(testvec+i,1,val); - //val*=10; - //} - sd_set_value<<<1,1>>>(testvec+6,3,1); - } - std::cout << "called "; - real * testout1 = testvec+mobility.ldd*1; - real * testout2 = testvec+mobility.ldd*2; - const real one=1; - sd_Rgemv(&one, mobility, testvec, testout1); - if (warnings > 2){ - mobility.print(); - } -#endif - //sd_wavepart_addto_matrix<<>>(mobility.wavespace->num, - sd_wavepart_addto_matrix<<>>(mobility.wavespace->num, mobility.wavespace->matrices, - mobility.wavespace->sines, mobility.wavespace->cosines, - mobility.ldd_short, N, - mobility.data, mobility.ldd); - - cudaCheckError(""); - mobility._free_wavespace(); - cudaCheckError(""); -#ifdef SD_DEBUG - sd_Rgemv(&one, mobility, testvec, testout2); - const real minusone=-1; - cublasCall(cublasRaxpy(cublas, mobility.size, &minusone, testout2, 1, testout1, 1)); - real erg; - cublasCall(cublasRnrm2(cublas, mobility.size, testout1, 1, &erg)); - if (erg > 0.01){ - printVectorDev(testvec, mobility.size, "in "); - printVectorDev(testout2, mobility.size, "correct out"); - printVectorDev(testout1, mobility.size, "difference "); - mobility.print(); - fflush(stdout); - } - assert(erg < 0.01); - */ -#endif - } - //mobility.printStats(); - if (!isSymmetricDev(mobility)){ - if (warnings > 2){ - printMatrixDev(mobility); - } - } - assert(isSymmetricDev(mobility)); - if (warnings > 20){ - //printMatrixDev(mobility); - //mobility.printWavespace(); - } -#endif - -#ifndef SD_FF_ONLY - // compute the resistance matrix - matrix resistance; - resistance.size = N*3; - resistance.ldd = ((N*3+31)/32)*32; - resistance.ldd_short = ((N+31)/32)*32; - int lda_short=resistance.ldd_short; - resistance.is_sparse=true; - const int max_part_in_lc=125; - cuda_safe_mem(cudaMalloc( (void**)&resistance.data, lda* 3* max_part_in_lc* sizeof(real) )); - cuda_safe_mem(cudaMalloc( (void**)&resistance.col_idx, lda_short* max_part_in_lc* sizeof(int) )); - cuda_safe_mem(cudaMalloc( (void**)&resistance.row_l, lda_short*sizeof(int) )); - - if (use_buckets){ - cudaThreadSynchronize(); - cudaCheckError("finding interaction error"); - sd_find_interacting_particles<<>>(r_d, L_d, N, resistance.col_idx, resistance.row_l, - particle_count, particle_sorted_list, bucket_size, - bucket_num, particle_to_bucket_list, 4*radius,total_bucket_num); - cudaThreadSynchronize(); - cudaCheckError("finding interaction error"); - } else { - cudaThreadSynchronize(); - cudaCheckError("finding interaction error"); - sd_find_interacting_particles<<>>(r_d, L_d, N, resistance.col_idx, resistance.row_l, - 4*radius); - cudaThreadSynchronize(); - cudaCheckError("finding interaction error"); - - } - //#ifdef SD_DEBUG - { - assert(sd_find_max(resistance.row_l,N) < max_part_in_lc ); - } - //#endif // SD_DEBUG - sd_compute_resistance_matrix_sparse<<>>(r_d, N, -1./(6.*M_PI*viscosity*radius), radius, L_d, - resistance.data, resistance.col_idx, resistance.row_l, myInfo_d); - /*} else { - resistance.is_sparse=false; - cuda_safe_mem(cudaMalloc( (void**)&resistance_d, lda*N*3*sizeof(real) )); - sd_set_zero_matrix<<>>(resistance.data,resistance.size,resistance.ldd); - cudaThreadSynchronize(); // debug - cudaCheckError("sd_set_zero"); - sd_compute_resistance_matrix<<< numBlocks , numThreadsPerBlock >>>(r_d,N,-1./(6.*M_PI*viscosity*radius), radius, L_d, resistance_d, myInfo_d); - //#warning "bug" - }*/ - cudaThreadSynchronize(); // we need both matrices to continue; - cudaCheckError("compute resistance error"); -#endif - //printVectorDev(resistance.data,resistance.size,"resis first line"); -#ifdef SD_DEBUG - assert(!hasAnyNanDev(mobility_d,N*3*lda)); -#ifndef SD_FF_ONLY - if (hasAnyNanDev(resistance_d,N*3*lda)){ - printPosDev(r_d,N,"positions"); - //printMatrixDev(resistance_d,lda, 3*N,"Resistance with nans ..."); - } - assert(!hasAnyNanDev(resistance_d,N*3*lda)); - assert(isSymmetricDev(resistance_d,lda,N*3)); -#endif -#endif - // initialize displacement - static int last_N=-1; - static real * last_det_disp=NULL; - if (N == last_N && last_det_disp!=NULL){ - cublasCall(cublasRcopy(cublas, N*3, last_det_disp,1, disp_d,1)); - } else { - sd_set_zero<<>>(disp_d,N*3); - } - //printVectorDev(r_d, 3*N,"pos"); -#ifdef SD_FF_ONLY - const real one=1; - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, 3*N, 3*N, &one, mobility_d, lda, force_d, 1, &zero, disp_d, 1)); - sd_Rgemv(&one, mobility,force_d,disp_d); -#else - real err_force = sd_iterative_solver(mobility, resistance, force_d, disp_d,det_prec,0, true); -#endif -#ifdef SD_DEBUG - if (hasAnyNanDev(disp_d,N*3)){ - printVectorDev(disp_d,N*3,"disp"); - printVectorDev(force_d,N*3,"force"); -#ifndef SD_FF_ONLY - //printMatrixDev(resistance_d,lda,N*3,"resistance produces nans?"); -#endif - } - assert(!hasAnyNanDev(disp_d,N*3)); -#endif - // save deterministic displacement - if (N > last_N){ - if (last_det_disp!= NULL){ - cuda_safe_mem(cudaFree((void*)last_det_disp)); - last_det_disp=NULL; - } - } - if (last_det_disp==NULL){ - cuda_safe_mem(cudaMalloc( (void **) &last_det_disp, N*3*sizeof(real))); assert(last_det_disp !=NULL); - } - cublasCall(cublasRcopy(cublas, N*3, disp_d, 1, last_det_disp,1)); - last_N=N; - { - static int printed=0; - if (!printed){ - ;//printMatrixDev(mobility); - } - } - // brownian part - if (temperature > 0 ){ -#ifndef SD_FF_ONLY - int myInfo_h[3]; - cuda_safe_mem(cudaMemcpy(myInfo_h,myInfo_d,3*sizeof(int),cudaMemcpyDeviceToHost)); -#else - const int myInfo_h[3]={0,0,0}; -#endif - int N_ldd = ((N+31)/32)*32; - int num_of_rands = N_ldd*myInfo_h[2]*2*DIM+N_ldd*DIM; // has to be even! - if (myInfo_h[0]){ - fprintf(stderr,"We had %d overlapping particles!\n",myInfo_h[0]); - } - real * brownian_force_nf = NULL; - if (myInfo_h[2]){ - cuda_safe_mem(cudaMalloc( (void**)&brownian_force_nf, (3*N)*sizeof(real) )); assert(brownian_force_nf != NULL); - } - real * brownian_force_ff = NULL; - cuda_safe_mem(cudaMalloc( (void**)&brownian_force_ff, (3*N)*sizeof(real) )); assert(brownian_force_ff != NULL); - real * gaussian = NULL; - cuda_safe_mem(cudaMalloc( (void**)&gaussian, (num_of_rands)*sizeof(real) )); assert(gaussian != NULL); - real * gaussian_ff = gaussian; - real * gaussian_nf = gaussian+N_ldd*DIM; - unsigned long long * sd_random_generator_offset = (unsigned long long *)sd_random_state; - unsigned long long * sd_seed_ull = (unsigned long long *)sd_seed; - static unsigned long long sd_random_generator_offset_last= *sd_random_generator_offset; - if (generator == NULL){ - curandCall(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - curandCall(curandSetPseudoRandomGeneratorSeed(generator, *sd_seed_ull)); - curandCall(curandSetGeneratorOrdering( generator, CURAND_ORDERING_PSEUDO_BEST)); - curandCall(curandSetGeneratorOffset( generator, *sd_random_generator_offset)); - } - if (* sd_random_generator_offset != sd_random_generator_offset_last){ - curandCall(curandSetGeneratorOffset( generator, *sd_random_generator_offset)); - } - //#ifdef FLATNOISE - // this does not work yet: - //curandCall(curandGenerateUniformReal(generator, gaussian_d, num_of_rands, 0, sqrt(24.*temperature/time_step))); - //#else - sd_random_generator_offset[0]+=num_of_rands/2; - curandCall(curandGenerateNormalReal(generator, gaussian, num_of_rands, 0, sqrt(4*temperature/time_step))); - sd_random_generator_offset_last=sd_random_generator_offset[0]; - //#endif - if (myInfo_h[2]){ - sd_set_zero<<<64,192>>>(brownian_force_nf,3*N); - sd_compute_brownian_force_nearfield<<>>(r_d, gaussian_nf, N, L_d, radius, - 1./(6.*M_PI*viscosity*radius), brownian_force_nf); - }// end of near field - int size=N*3; - real alpha=1/sqrt(2.); - // in the case of FF_ONLY this computes directly the displacement - real E_cheby=sd_compute_brownian_force_farfield(mobility, gaussian_ff, ran_prec, brownian_force_ff); - //real bff_nrm; - //cublasCall(cublasRnrm2(cublas, size, brownian_force_ff, 1, &bff_nrm)); - if (E_cheby > 10*ran_prec){ - E_cheby=sd_compute_brownian_force_farfield(mobility, gaussian_ff, ran_prec, brownian_force_ff); - //cublasCall(cublasRnrm2(cublas, size, brownian_force_ff, 1, &bff_nrm)); - if (warnings > 1) fprintf(stderr, "Recalculating the Chebyshev-polynome\n"); - } - if ((E_cheby>100*ran_prec && warnings) || (E_cheby > 10*ran_prec && warnings > 1) ){ - fprintf(stderr, "The error of the Chebyshev-approximation was %7.3f%%\n",E_cheby*100); - //printVectorDev(mobility.wavespace->sines,mobility.wavespace->max*mobility.ldd_short,"sines"); - //printVectorDev(mobility.wavespace->matrices,mobility.wavespace->max*6,"matrices"); - } - //printVectorDev(gaussian_ff,size, "gaussian"); - //printVectorDev(brownian_force_ff, size, "brownian force"); - //printVectorDev(mobility_d, 1, "mobility"); - //printf("E_Cheby is %e and bff is %e\n",E_cheby,bff_nrm); - //printVectorDev(brownian_force_ff,size,"FBff: "); - //printVectorDev(brownian_force_nf,size,"FBnf: "); - real * brownian_disp; -#ifndef SD_FF_ONLY - cuda_safe_mem(cudaMalloc( (void**)&brownian_disp, size*sizeof(real) )); assert(brownian_disp != NULL); - sd_set_zero<<<32,32>>>(brownian_disp,size); - const real one=1; - if (myInfo_h[2]){ - cublasCall(cublasRaxpy(cublas, size, &one, brownian_force_nf, 1, brownian_force_ff, 1 )); - } - sd_iterative_solver(mobility, resistance, brownian_force_ff, brownian_disp, ran_prec,0, false);//(E_cheby/2+err_force/2)*1e-3 -#else - //real one=1, zero=0; - //cuda_safe_mem(cudaMalloc( (void**)&brownian_disp, size*sizeof(real) )); assert(brownian_disp != NULL); - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, mobility_d, lda, brownian_force_ff, 1, &zero, brownian_disp, 1)); - brownian_disp = brownian_force_ff; -#endif - //real tmp; - //printVectorDev(brownian_disp, size, "brownian disp"); - //printVectorDev(disp_d, size, "det disp"); - //cublasCall(cublasRnrm2(cublas, size, brownian_disp, 1, &tmp)); - //printf("Brownian disp is: %e ",tmp); - //cublasCall(cublasRnrm2(cublas, size, disp_d, 1, &tmp)); - //printf("Deterministic disp is: %e \n",tmp); - - cublasCall(cublasRaxpy( cublas, size, &alpha, brownian_disp, 1, disp_d, 1 )); -#ifndef SD_FF_ONLY - cuda_safe_mem(cudaFree((void*)brownian_disp)); -#endif - cuda_safe_mem(cudaFree((void*)brownian_force_ff)); - if (myInfo_h[2]){ - cuda_safe_mem(cudaFree((void*)brownian_force_nf)); - } - cuda_safe_mem(cudaFree((void*)gaussian)); - }// end of brownian motion - cudaCheckError("brownian motion error"); - - // free everything -#ifndef SD_FF_ONLY - if (use_buckets) { - cuda_safe_mem(cudaFree((void*)bucket_size)); - cuda_safe_mem(cudaFree((void*)bucket_num)); - cuda_safe_mem(cudaFree((void*)particle_count)); - cuda_safe_mem(cudaFree((void*)particle_to_bucket_list)); - cuda_safe_mem(cudaFree((void*)particle_sorted_list)); - } -#endif - cudaCheckError("in mobility"); -} - -/// this calls an iterative solver to solve the problem: -/// disp * (1+resistance*mobility) = mobility_d * force_d -/// and returnes \param disp -/// \param mobility and \param resistance are square matrizes with \param size and ldd <((size+31)/32)*32> -/// \param force and \param disp are vectors of \param size -real sd_iterative_solver(const matrix& mobility, const matrix& resistance, const real * force, real * disp, real rel_tol, real abs_tol, bool recalc) -{ - int size=mobility.size; - if (abs(abs_tol) > 1e-8){ - if (warnings > 1) fprintf(stderr,"Solving for brownian forces.\n"); - } else { - if (warnings > 1) fprintf(stderr,"Solving for interparticle forces.\n"); - } -#ifdef SD_DEBUG - assert(!hasAnyNanDev(mobility.data,mobility.size*mobility.ldd)); - assert(!hasAnyNanDev(resistance.data,resistance.size*mobility.ldd)); - assert(!hasAnyNanDev(force,size)); - assert(!hasAnyNanDev(disp,size)); -#endif -#ifdef SD_PC - static real * mat_a = NULL; - if (mat_a==NULL){ - cuda_safe_mem(cudaMalloc( (void**)&mat_a, lda*size*sizeof(real) )); - recalc=true; - } assert(mat_a != NULL); - if (recalc){ - sd_set_zero_matrix<<<32,192>>>(mat_a,size,lda); - } -#endif - real * mob_force=NULL; - cuda_safe_mem(cudaMalloc( (void**)&mob_force, size*sizeof(real) )); assert(mob_force !=NULL); - real * result_checker=NULL; - cuda_safe_mem(cudaMalloc( (void**)&result_checker, size*sizeof(real) )); assert(result_checker !=NULL); - // vars for cuBLAS calls - real alpha=1; -#ifdef SD_PC - // mat_a = (1+resistance*mobility) - if (recalc){ - assert(!resistance.is_sparse); - assert(!mobility.is_sparse); - assert(mobility.wavepart==NULL); - cublasCall(cublasRgemm(cublas,CUBLAS_OP_N,CUBLAS_OP_N, size , size ,size, &alpha, mobility.data, lda,resistance.data, lda, &beta,mat_a, lda)); - sd_add_identity_matrix<<<64,192>>>(mat_a,size,lda); - } -#endif - // mob_force = mobility * force - //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility.data, lda, force, 1, &beta, mob_force, 1)); - sd_Rgemv(&alpha, mobility, force,mob_force); -#ifdef SD_DEBUG - assert(!hasAnyNanDev(mob_force,size)); -# ifdef SD_PC - assert(!hasAnyNanDev(mat_a,size*lda)); -# endif -#endif - int info; - real res; - //printVectorDev((real *)force,size,"Kraft"); - //printMatrixDev((real *)mobility, lda, size, "mobility"); - //printMatrixDev((real* )mat_a,lda,size,"A"); - //printVectorDev(disp,6,"before"); - static int last_solv_info=4; - if (last_solv_info>1){ - sd_set_zero<<<16,192>>>(disp,size); - } - //info = sd_bicgstab_solver(cublas ,size, mat_a,lda, mob_force, rel_tol, abs_tol, 10*size+100, disp, &res); -#ifdef SD_PC - info = sd_gmres_solver(cublas ,size, mat_a, lda, mob_force, rel_tol, abs_tol, size, disp, &res); - //info = sd_bicgstab_solver(cublas ,size, mat_a, lda, mob_force, rel_tol, abs_tol, size, disp, &res); -#else - info = sd_gmres_solver(mobility, resistance, mob_force, rel_tol, abs_tol, size, disp, &res); - //info = sd_bicgstab_solver(cublas ,size, mobility, resistance, lda, mob_force, rel_tol, abs_tol, size, disp, &res); -#endif - last_solv_info=info; - //printVectorDev(disp,6,"after"); - // compary to expected result - //cuda_safe_mem(cudaMemcpy(mat_a, mat_a_bak, lda*size*sizeof(real),cudaMemcpyDeviceToDevice)); - - if (info != 0){ - if (info == 1){ - if (warnings>1) fprintf(stderr, "Iterative solver did not fully converge ... the residuum was %6e\nWe will continue anyway ...\n",res); - } - else{ // info == 2 || info == 4 - if (info == 1){ - if (warnings>1) fprintf(stderr, "Iterative solver did not fully converge ... the residuum was %6e\nWe will continue anyway ...\n",res); - } - else if (info == 2){ - if(warnings) fprintf(stdout, "Iterative solver failed ... the residuum was %6e\nWe will continue but the results may be problematic ...\n",res); - } - } - // dgetrs is not better - the contrary: results are worse ... - /*int ipiv[size]; - magma_dgetrf_gpu( size, size,mat_a, lda, ipiv, &info); - assert(info==0); - magma_dgetrs_gpu('N', size, 1, - mat_a, lda, ipiv, - disp, size, &info); - assert(info==0); - // compary to expected result - cuda_safe_mem(cudaMemcpy(mat_a, mat_a_bak, lda*size*sizeof(real),cudaMemcpyDeviceToDevice)); - cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mat_a, lda, disp, 1, &beta, result_checker, 1)); - alpha=-1; - cublasCall(cublasRaxpy( cublas, size, &alpha, mob_force, 1, result_checker, 1)); - alpha=1; - cublasCall(cublasRdot( cublas, size, result_checker, 1, result_checker, 1,&res)); - if (res > 1e-1){ - fprintf(stderr, "All methods failed :(. The residuum from getrs was %e\n",res); - //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mat_a, lda, disp, 1, &beta, result_checker, 1)); - //printVectorDev(mob_force, size, "mob_force"); - //printVectorDev(result_checker, size, "result_checker"); - //printVectorDev(disp, size, "disp"); - //printMatrixDev((real *)mobility,lda,size,"mobility"); - //printMatrixDev((real *)resistance,lda,size,"res"); - //printMatrixDev((real *)mat_a,lda,size,"mat_a"); - }*/ - //magma_int_t magma_dgetrs_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, - // double *dA, magma_int_t ldda, magma_int_t *ipiv, - // double *dB, magma_int_t lddb, magma_int_t *info); - } -#ifdef SD_DEBUG - assert(!hasAnyNanDev(disp,size)); -#endif - //assert(info==0); - //cuda_safe_mem(cudaFree((void*)mat_a)); - //cuda_safe_mem(cudaFree((void*)mat_a_bak)); - cuda_safe_mem(cudaFree((void*)mob_force)); - cuda_safe_mem(cudaFree((void*)result_checker)); - return res; -} - -// BICGSTAB-Solver -// GMRes works better, therefore no support for BiCGStab -// implimented as given in Numerik linearer Gleichungssysteme by Prof. Dr. Andreas Meister -// this solves A*x=b -// cublas a handle for cublas -// size the size n of the matrix -// A the given n*n matrix = A1*A2+1 (in) -// A1 the two matrices -// A2 -// lda the leading demension of A(1,2) -// b the given solution vector (in) -// tol requested tolerance of the solution -// maxit maximum number of iterations -// x the requested solution with an initial guess (in/out) -// returns 0 on success, else error code -/*#define sd_bicgstab_solver_exit() cuda_safe_mem(cudaFree((void*)r0)); \ - cuda_safe_mem(cudaFree((void*)r));cuda_safe_mem(cudaFree((void*)p)); \ - cuda_safe_mem(cudaFree((void*)v));cuda_safe_mem(cudaFree((void*)t)); \ - cuda_safe_mem(cudaFree((void*)tmp)); \ - res[0] = normr; \ - if (warnings > 1) printf("BICSTAB used %d iterations.\n", iteration); \ - if (tolb*1.01 > normr) { return 0;} \ - if (tolb*100 > normr) { return 1;} \ - if (initnorm*1e10 > normr) { return 2;} \ - else { return 4;} -#ifdef SD_PC -int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A, int lda, - real * b, real tol,real abs_tol, int maxit, real * x, real * res){ -#else -int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A1, const real * A2, int lda, - real * b, real tol,real abs_tol, int maxit, real * x, real * res){ -#endif - // vector malloc - real * r0=NULL; - cuda_safe_mem(cudaMalloc( (void**)&r0, size*sizeof(real) )); assert(r0 != NULL); - real * r=NULL; - cuda_safe_mem(cudaMalloc( (void**)&r, size*sizeof(real) )); assert(r != NULL); - real * p=NULL; - cuda_safe_mem(cudaMalloc( (void**)&p, size*sizeof(real) )); assert(p != NULL); - real * v=NULL; - cuda_safe_mem(cudaMalloc( (void**)&v, size*sizeof(real) )); assert(v != NULL); - real * t=NULL; - cuda_safe_mem(cudaMalloc( (void**)&t, size*sizeof(real) )); assert(t != NULL); - real * tmp=NULL; - cuda_safe_mem(cudaMalloc( (void**)&tmp, 2*lda*sizeof(real) )); assert(tmp != NULL); - //printMatrixDev(A,lda,size,"mat_a"); - //printVectorDev(b,size,"force"); -#ifdef SD_PC - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, b, 1)); -#endif - // constants - real eps; - if (sizeof(real) == sizeof(double)){ - eps = 1e-15; - } else { - eps = 1e-7; - } - eps = min(eps,tol*1e-2); - // other variables - real alpha=1; - real beta=0; - real tolb; - // compute the norm of b - real normb; - cublasCall(cublasRnrm2( cublas, size, b, 1, &normb)); - // compute the tolerance we want to achieve - tolb=tol*normb+abs_tol; - // r0 = b-A*x - alpha = -1; -#ifdef SD_PC - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, x, 1, &beta, r0, 1)); - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, r0, 1)); -#else - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, x, 1, &beta, tmp, 1)); - alpha=1; - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp, 1, &beta, r0, 1)); - alpha=-1; - cublasCall(cublasRaxpy(cublas, size, &alpha, x, 1, r0, 1)); -#endif - alpha = 1; - cublasCall(cublasRaxpy(cublas, size, &alpha, b, 1, r0, 1)); - // r = r0 - cublasCall(cublasRcopy(cublas, size,r0,1,r, 1)); - // rr0 = r*r0 - real rr0; - cublasCall(cublasRdot( cublas, size, r0, 1, r0, 1, &rr0)); - // p =r - cublasCall(cublasRcopy(cublas, size,r0,1,p, 1)); - // normr=norm(r) - real normr=sqrt(rr0); - int iteration=0; - real lastnorm=normr; - real initnorm=normr; - // check for conversion or max iterations - while (iteration < maxit && normr >= tolb){ - // v=A*p -#ifdef SD_PC - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, p, 1, &beta, v, 1)); - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, v, 1)); -#else - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, p, 1, &beta, tmp, 1)); - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp, 1, &beta, v, 1)); - cublasCall(cublasRaxpy(cublas, size, &alpha, p, 1, v, 1)); -#endif - // vr0 = v*r0 - real vr0; - cublasCall(cublasRdot( cublas, size, v, 1, r0, 1, &vr0)); - if (fabs(vr0) < eps || rr0 == 0){ - if (fabs(vr0) < eps){ - if (warnings > 1) fprintf(stderr, "BICGSTAB break-down.\n"); - }else{ - if (warnings > 1) fprintf(stderr, "BICGSTAB solution stagnates.\n"); - } - sd_bicgstab_solver_exit(); - } - // alpha = rr0/vr0 - real myAlpha=rr0/vr0; - real minusMyAlpha = -myAlpha; - // s = r - alpha v - //cublasCall(cublasRcopy(cublas, size,r,1,s, 1)); - cublasCall(cublasRaxpy(cublas, size, &minusMyAlpha, v, 1, r, 1)); //s->r - // t = A * s -#ifdef SD_PC - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, r, 1, &beta, t, 1));// s->r - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, t, 1)); -#else - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, r, 1, &beta, tmp, 1));// s->r - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp, 1, &beta, t, 1));// s->r - cublasCall(cublasRaxpy(cublas, size, &alpha, r, 1, t, 1)); -#endif - // ts = s * t - real ts; - cublasCall(cublasRdot( cublas, size, t, 1, r, 1, &ts));// s->r - // tt = t * t - real tt; - cublasCall(cublasRdot( cublas, size, t, 1, t, 1, &tt)); - if (abs(tt) 1) fprintf(stderr, "Exit: abs(tt) 1) fprintf(stderr, "BICGSTAB break-down.\n"); - sd_bicgstab_solver_exit(); - } - // omega = ts/tt - real myOmega=ts/tt; - // x = x + alpha p + omega s - cublasCall(cublasRaxpy(cublas, size, &myAlpha, p, 1, x, 1)); - cublasCall(cublasRaxpy(cublas, size, &myOmega, r, 1, x, 1)); - // copyback of s to r - // r = s - omega t - real minusMyOmega=-1*myOmega; - cublasCall(cublasRaxpy(cublas, size, &minusMyOmega, t, 1, r, 1)); - //myOmega*=-1; - // r1r0 = r * r0 - real r1r0; - cublasCall(cublasRdot( cublas, size, r, 1, r0, 1, &r1r0)); - // beta = (alpha * r1r0 ) / (omega rr0) - real myBeta = (myAlpha*r1r0)/(myOmega*rr0); - if (abs(myBeta)>1/eps){ - if (warnings > 1) fprintf(stderr,"Exit: abs(myBeta)<1/eps\n"); - sd_bicgstab_solver_exit() - } - // p = r + beta ( p - omega v)= beta p + r - beta omega v - cublasCall(cublasRscal(cublas, size, &myBeta, p, 1)); - cublasCall(cublasRaxpy(cublas, size, &alpha, r, 1, p, 1)); - alpha=-myBeta*myOmega; - cublasCall(cublasRaxpy(cublas, size, &alpha, v, 1, p, 1)); - alpha=1; - rr0=r1r0; - cublasCall(cublasRnrm2( cublas, size, r, 1, &normr)); - if (lastnorm*sqrt(eps) > normr){ // restart - cublasCall(cublasRcopy(cublas, size,b,1,r, 1)); - alpha=-1;beta=1; // r <-- r-A*x -#ifdef SD_PC - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, x, 1, &beta, r, 1)); - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, r, 1)); -#else // TODO - alpha=1, beta=0; - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, x, 1, &beta, tmp+lda, 1));// s->r - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp+lda, 1, &beta, tmp, 1));// s->r - cublasCall(cublasRaxpy(cublas, size, &alpha, x, 1, tmp, 1)); - alpha=-1; - cublasCall(cublasRaxpy(cublas, size, &alpha, tmp, 1, r, 1)); -#endif - alpha= 1;beta=0; - cublasCall(cublasRdot( cublas, size, r, 1, r, 1, &rr0)); - normr=sqrt(rr0); - lastnorm = normr; - // r = r0 - cublasCall(cublasRcopy(cublas, size,r,1,r0, 1)); - // p =r - cublasCall(cublasRcopy(cublas, size,r,1,p, 1)); - } - if (iteration%50000 == 0){ // enable debugging by setting this to a lower value - real realnorm; - {// recalculate normr - alpha=-1;beta=0; -#ifdef SD_PC - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, x, 1, &beta, tmp, 1)); - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, tmp, 1)); -#else - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, x, 1, &beta, tmp+lda, 1));// s->r - alpha=1; - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp+lda, 1, &beta, tmp, 1));// s->r - alpha=-1; - cublasCall(cublasRaxpy(cublas, size, &alpha, r, 1, tmp, 1)); -#endif - alpha=1; - cublasCall(cublasRaxpy(cublas, size, &alpha, b,1,tmp, 1)); - alpha= 1;beta=0; - cublasCall(cublasRnrm2(cublas, size, tmp, 1, &realnorm)); - } - fprintf(stderr," Iteration: %6d Residuum: %12f RealResiduum: %12f\n",iteration, normr, realnorm); - } - iteration++; - if (initnorm*1e10 < normr){ // somehow our solution explodes ... - if (warnings) fprintf(stderr, "BICGSTAB did not converge, residuum exploded. Aborting.\n"); - sd_bicgstab_solver_exit(); - } - } - res[0]=normr; - if (normr > tolb*1.01){ - if (warnings) fprintf(stderr, "BICGSTAB solution did not converge after %d iterations. Error was %e1 %% to high.\n",iteration,(normr/tolb-1)*100); - } - //fprintf(stderr, "BICGSTAB solution did converge after %d iterations.\n",iteration); - sd_bicgstab_solver_exit(); -} -*/ - -// GMRes-Solver -// implimented as given in Numerik linearer Gleichungssysteme by Prof. Dr. Andreas Meister -// this solves A*x=b -// cublas a handle for cublas -// size the size n of the matrix -// A the given n*n matrix (in) -// lda the leading demension of A -// b the given solution vector (in) -// tol requested tolerance of the solution -// maxit maximum number of iterations -// x the requested solution with an initial guess (in/out) -// returns 0 on success, else error code -#define sd_gmres_solver_exit() cuda_safe_mem(cudaFree((void*)r0)); \ - cuda_safe_mem(cudaFree((void*)v));cuda_safe_mem(cudaFree((void*)w)); \ - cuda_safe_mem(cudaFree((void*)tmp)); res[0] = normr; \ - if (warnings > 1) printf("GMRes used %d iterations.\n", iteration); \ - if (tolb*1.01 >= normr) { return 0;} \ - printf("tolb: %e, normr: %e, eps: %e \n",tolb, normr, eps); \ - if (tolb*100 > normr) { return 1;} \ - if (initnorm*1e10 >= normr) { return 2;} \ - else { return 4;} -#ifdef SD_PC -int sd_gmres_solver(int size, const real * A,int lda, const real * b, real tol,real abs_tol, int maxit, real * x, real * res){ -#else -int sd_gmres_solver(const matrix & A1, const matrix & A2 , const real * b, real tol,real abs_tol, int maxit, real * x, real * res){ -#endif - int size=A1.size; - int lda =A1.ldd; - const int m=60; - // vector malloc device - real * r0=NULL; - cuda_safe_mem(cudaMalloc( (void**)&r0, size*sizeof(real) )); assert(r0 != NULL); - real * v=NULL; - cuda_safe_mem(cudaMalloc( (void**)&v, m*lda*sizeof(real) )); assert(v != NULL); - real * w=NULL; - cuda_safe_mem(cudaMalloc( (void**)&w, size*sizeof(real) )); assert(w != NULL); - real * tmp=NULL; - cuda_safe_mem(cudaMalloc( (void**)&tmp, size*sizeof(real) )); assert(tmp != NULL); - // malloc host - const real minusone=-1; - const real one=1; - real gamma[m+1]; - real alpha[m]; - real h[(m+1)*(m+1)]; - real s[m]; - real c[m]; -#ifdef SD_DEBUG - for (int i=0;i 1) printf("inital normr: %e\n",normr); - int iteration=0; - if (normr==0){ - sd_gmres_solver_exit(); - } - if (normb==0){ - sd_set_zero<<<192,192>>>(x,size); - normr=0; - sd_gmres_solver_exit(); - } - do { - cublasCall(cublasRcopy(cublas, size, r0, 1, v ,1)); - gamma[0]=normr; - real igamma=1/normr; - cublasCall(cublasRscal(cublas, size, &igamma, v, 1)); - int j; - for (j=0;j tolb ;j++){ -#ifdef SD_PC - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, A, lda, v+lda*j, 1, &zero, w, 1)); - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, w, 1)); -#else - sd_Rgemv(&one, A2, v+lda*j, tmp); - sd_Rgemv(&one, A1, tmp, w); - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, A2, lda, v+lda*j, 1, &zero, tmp, 1)); - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, A1, lda, tmp, 1, &zero, w, 1)); - cublasCall(cublasRaxpy(cublas, size, &one, v+lda*j, 1, w, 1)); -#endif - for (int i=0;i=0;i--){ - real tmp=gamma[i]; - for (int k=i+1;k 1) printf("Iteration: %d, j was %d, residiuum: %e\n", iteration, j, normr ); - iteration+=j; - { - real newnormr; - // r0 = b-A*x -#ifdef SD_PC - cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &minusone, A, lda, x, 1, &zero, r0, 1)); - cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, r0, 1)); -#else - sd_Rgemv(&minusone, A2, x, tmp); - sd_Rgemv(&one, A1, tmp, r0); - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &minusone, A2, lda, x, 1, &zero, tmp, 1)); - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, A1, lda, tmp, 1, &zero, r0, 1)); - cublasCall(cublasRaxpy(cublas, size, &minusone, x, 1, r0, 1)); -#endif - cublasCall(cublasRaxpy(cublas, size, &one, b, 1, r0, 1)); - cublasCall(cublasRnrm2(cublas, size, r0, 1, &newnormr)); - if ((newnormr/normr > 1.1 || newnormr/normr < 0.9) && newnormr > sqrt(eps)){ - if (warnings) printf("Our norm changed strangely: Expected value: %e, real value: %e - eps %e- Bug in GMRes Solver?\n",normr,newnormr, eps); - } - normr=newnormr; - } - } while (iteration < maxit && normr > tolb); - sd_gmres_solver_exit(); -} - -// calculates the largest and snalles eigenvalue of the matrix -// size : size of the eigenvector / the matrix (IN) -// mobility_d : handle of the mobility matrix (on the device) (IN) -// lambda_min : smalles eigenvalue (OUT) -// lambda_max : largest eigenvalue (OUT) -void calculate_maxmin_eigenvalues(const matrix & mobility, real * lambda_min, real * lambda_max, real tol){ - int size=mobility.size; - int lda = ((size+31)/32)*32; - int maxit=max(500,size); - int IDO; - char BMAT='I'; // standard eigenvalue problem - char WHICH[]="LR"; // start with largest eigenvalue - int NEV = 1; // only one eigenvalue - // TUNING: these could be adjusted? - real TOL=tol; - if (sizeof(double) == sizeof(real)){ - TOL=max(1e-12,TOL); - } else { - TOL=max(1e-6,TOL); - } - // TUNING: make some tests to find good value ... - int NCV=min(size, 6); // must be at least 3, but a bit bigger should be better ... - int LDV=lda; - int LWORKL=3*NCV*(NCV + 2); - int mode=1; - int IPARAM[11] = {1,0,maxit,1,0,0,mode,0,0,0,0}; - int IPNTR[14]; - int INFO=0; - real RESID[size]; - real V[LDV*NCV]; - real WORKD[3*size]; - real WORKL[LWORKL]; - real * vec_in_d; - real * vec_out_d; - cuda_safe_mem(cudaMalloc((void**)&vec_in_d , lda*sizeof(real))); - cuda_safe_mem(cudaMalloc((void**)&vec_out_d, lda*sizeof(real))); - for (int minmax=0;minmax<2;minmax++){ - IDO=0; - if (minmax){ - sprintf(WHICH,"SR"); - INFO=0; - IPARAM[2]=maxit; - TOL=sqrt(tol); - } - while (IDO != 99){ - //dnaupd_(&IDO,&BMAT,&N,WHICH,&NEV,&TOL,RESID.memptr(),&NCV,V.memptr(),&LDV,IPARAM,IPNTR,WORKD,WORKL,&LWORKL,&INFONAUP); - rnaupd(&IDO,&BMAT,&size, WHICH, &NEV, &TOL, RESID, &NCV, V, &LDV, IPARAM, IPNTR, WORKD, WORKL, &LWORKL, &INFO); - switch (IDO){ - case 1: - cuda_safe_mem(cudaMemcpy(vec_in_d,WORKD+IPNTR[0]-1,size*sizeof(real),cudaMemcpyHostToDevice)); - { - const real one=1; - //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, vec_in_d, 1, &zero, vec_out_d, 1)); - sd_Rgemv(&one, mobility, vec_in_d, vec_out_d); - } - cuda_safe_mem(cudaMemcpy(WORKD+IPNTR[1]-1,vec_out_d,size*sizeof(real),cudaMemcpyDeviceToHost)); - break; - case -1: - case 2: - case 3: - case 4: - fprintf(stderr,"Error in %s l. %d: unexpected work from rnaupd: %d: Not Implemented!\n",__FILE__,__LINE__,IDO); - break; - case 99: //we are done - break; - default: - fprintf(stderr,"Error in %s l. %d: unexpected work from rnaupd: %d: Not Understood!\n",__FILE__,__LINE__,IDO); - break; - } - } - if (warnings > 1) fprintf(stderr,"calculating eigenvalue needed %d iterations and %d gemv operations (tolerance is %e, EW is %e).\n" - ,IPARAM[2], IPARAM[8], TOL,WORKL[IPNTR[5]-1]); - if (INFO){ - if (INFO == 1 && warnings > 1){ - fprintf(stderr,"Maximum iterations taken to find Eigenvalues.\n"); - } else if( INFO > 1 && warnings){ - fprintf(stderr,"Unexpected return value in %s l. %d from rnaupd_: %d (debug info: %d %e)\n",__FILE__,__LINE__,INFO, minmax, V[0]); - } - /* - c INFO Integer. (INPUT/OUTPUT) - c If INFO .EQ. 0, a randomly initial residual vector is used. - c If INFO .NE. 0, RESID contains the initial residual vector, - c possibly from a previous run. - c Error flag on output. - c = 0: Normal exit. - c = 1: Maximum number of iterations taken. - c All possible eigenvalues of OP has been found. IPARAM(5) - c returns the number of wanted converged Ritz values. - c = 2: No longer an informational error. Deprecated starting - c with release 2 of ARPACK. - c = 3: No shifts could be applied during a cycle of the - c Implicitly restarted Arnoldi iteration. One possibility - c is to increase the size of NCV relative to NEV. - c See remark 4 below. - c = -1: N must be positive. - c = -2: NEV must be positive. - c = -3: NCV-NEV >= 2 and less than or equal to N. - c = -4: The maximum number of Arnoldi update iteration - c must be greater than zero. - c = -5: WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI' - c = -6: BMAT must be one of 'I' or 'G'. - c = -7: Length of private work array is not sufficient. - c = -8: Error return from LAPACK eigenvalue calculation; - c = -9: Starting vector is zero. - c = -10: IPARAM(7) must be 1,2,3,4. - c = -11: IPARAM(7) = 1 and BMAT = 'G' are incompatable. - c = -12: IPARAM(1) must be equal to 0 or 1. - c = -9999: Could not build an Arnoldi factorization. - c IPARAM(5) returns the size of the current Arnoldi - c factorization. - */ - } - if (WORKL[IPNTR[5]-1]<0 && TOL > 1e-3){ - minmax--; - TOL=min(tol,1e-4); - tol=min(tol*tol, 1e-8); - INFO=1; - IPARAM[2]=maxit*100; - } - if (minmax){ // make them a bit larger/smaller to be sure that we are in the interval of interrest ... - *lambda_min=WORKL[IPNTR[5]-1]*(1+TOL); - } else { - *lambda_max=WORKL[IPNTR[5]-1]*(1-TOL); - } - } - /* FORTRAN Comments of the dnaupd - c IPNTR(6): pointer to the real part of the ritz value array - c RITZR in WORKL. - c IPNTR(7): pointer to the imaginary part of the ritz value array - c RITZI in WORKL. - c IPNTR(8): pointer to the Ritz estimates in array WORKL associated - c with the Ritz values located in RITZR and RITZI in WORK - */ - if (warnings > 1) { - printf("Range of Farfield-Mobility: [%e, %e]\n", *lambda_min, *lambda_max); - } - cuda_safe_mem(cudaFree((void*)vec_in_d)); - cuda_safe_mem(cudaFree((void*)vec_out_d)); -} - -/// \brief caclulate the chebyshev coefficents for the squareroot of the matrix -/// It needs the largest and smalles eigenvalue which have to be computed before. -/// Using chebyshev-gausquadrature \link https://en.wikipedia.org/wiki/Chebyshev%E2%80%93Gauss_quadrature \endlink. -/// -/// \param lambda_min : the lower boundery -/// \param lambda_max : the upper boundery of the interval -/// \param tol : the given tollerance which should be achieved -/// \param coefficents : the pointer where the data will be stored -real calculate_chebyshev_coefficents(real lambda_min, real lambda_max, real tol,real ** coefficents){ - const int steps=1024*128; // with 1024 it should fit in L1 - unsigned int N=1024; // guess the number of coefficents we need, if more are needed -> realocate - if (*coefficents==NULL){ - *coefficents = (real *)Utils::malloc(N*sizeof(real)); - } else { - *coefficents = (real *)Utils::realloc(*coefficents,N*sizeof(real)); - } - real * current_polynome=NULL; - real * last_polynome=NULL; - last_polynome = (real *) Utils::malloc(steps * sizeof(real)); - current_polynome = (real *) Utils::malloc(steps * sizeof(real)); - real x[steps]; - real weight_and_func[steps]; - real lambda_m=lambda_max-lambda_min; - real lambda_p=lambda_max+lambda_min; - //fprintf(stderr,"lambda_min: %e lambda_max: %e ", lambda_min, lambda_max); - //fprintf(stderr,"lambda_minus: %e lambda_plusminus: %e\n", lambda_m, lambda_pm); - // init - real fac=2./steps; - //fprintf(stderr,"fac: %e\n", fac); - double ai=0; - double a1=0; - for (int i=0;i N){ - N*=2; - *coefficents=(real *)Utils::realloc(*coefficents,N*sizeof(real)); - } - } while ((error > tol*totalsum/10 || loop < 20 ) && loop < sqrt(steps)); - if (loop >=steps/10 -1 ){ - fprintf(stderr,"to few steps to get sufficent results in %s l. %d\n",__FILE__,__LINE__); - } - error=0; - while (error < tol*totalsum){ // approximate error - loop--; - error+=abs((*coefficents)[loop]);//*sumfacmax; - //sumfacmax/=lambda_max; - } - //fprintf(stderr,"sum: %e error: %e",totalsum,error); - loop++; - free(last_polynome); - free(current_polynome); - if (warnings > 1) fprintf(stderr,"chebyshev needs %d gemv\n",loop); - return loop; -} -/// \brief Compute the brownian force resulting from the farfield -/// To compute the farfield contribution of the brownian motion the -/// squareroot of the inverse of the mobility is required. -/// Instead of calculating it directly, its action on the vector -/// containing the randomnumbers is computed via the chebyshev-polynomials. -/// In the case of FF_ONLY this computes directly the displacement! -real sd_compute_brownian_force_farfield(const matrix & mobility, const real * gaussian_ff, - real tol, real * brownian_force_ff ){ - - cudaCheckError("Unknown Error"); - int size=mobility.size; - static real * cheby_coefficents=NULL; - static int N_chebyshev; - static bool recalc_ew = true; - static real lambda_min, lambda_max; - static int count_recalc=0; - static int count_total=0; - count_total++; - cudaCheckError("Unknown Error"); - if (recalc_ew){ - count_recalc++; - calculate_maxmin_eigenvalues(mobility,&lambda_min, &lambda_max, tol); - //printf("lambda min: %e, lambda_max: %e",lambda_min, lambda_max); - N_chebyshev = calculate_chebyshev_coefficents(lambda_min, lambda_max,tol*tol,&cheby_coefficents); - //printf("\nWe need %d iteration for Chebyshev.\n", N_chebyshev); - recalc_ew=false; - if (warnings>1){printf("recalc ratio: %6f\n",(((double)count_recalc)/((double)count_total)));} - } - if (lambda_min < 0){ - std::cerr << "Mobility has negative eigenvalues!\n" << std::endl; - std::cout << "Mobility has negative eigenvalues!\n" << std::endl; - if (mobility.wavespace != NULL){ - //printMatrixDev(mobility); - } - if (warnings > 2){ - //printMatrixDev(mobility); - } - //printMatrixDev(mobility.data,mobility.ldd,mobility.size,"Mobility has negative eigenvalues!\n"); - errexit(); - } - real * chebyshev_vec_curr, * chebyshev_vec_last, * chebyshev_vec_next; - sd_set_zero<<<64,192>>>(brownian_force_ff,size); - cudaThreadSynchronize(); - cudaCheckError("set zero"); - chebyshev_vec_curr=NULL; - cuda_safe_mem(cudaMalloc( (void**)&chebyshev_vec_curr, size*sizeof(real) )); assert(chebyshev_vec_curr != NULL); -#ifdef SD_FF_ONLY - const real one =1; - real gMg; - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, mobility_d, lda, gaussian_ff, 1, &zero, chebyshev_vec_curr, 1)); - sd_Rgemv(&one, mobility, gaussian_ff, chebyshev_vec_curr); - cublasCall(cublasRdot(cublas, size, chebyshev_vec_curr, 1, gaussian_ff, 1, &gMg)); -#else - real gaussian_ff_norm; - cublasCall(cublasRnrm2(cublas, size, gaussian_ff, 1, &gaussian_ff_norm)); -#endif - cublasCall(cublasRcopy( cublas, size, gaussian_ff, 1, chebyshev_vec_curr, 1)); - cublasCall(cublasRaxpy( cublas, size, cheby_coefficents+0, chebyshev_vec_curr, 1, brownian_force_ff, 1 )); - //chebyshev_vec_last=chebyshev_vec_curr; - chebyshev_vec_last=NULL; - cuda_safe_mem(cudaMalloc( (void**)&chebyshev_vec_last, size*sizeof(real) )); assert(chebyshev_vec_last != NULL); - chebyshev_vec_next=NULL; - cuda_safe_mem(cudaMalloc( (void**)&chebyshev_vec_next, size*sizeof(real) )); assert(chebyshev_vec_next != NULL); - //sd_set_zero<<<64,192>>>(chebyshev_vec_????,size); - real lambda_minus=lambda_max-lambda_min; - real alpha=2./lambda_minus; - sd_Rgemv(&alpha, mobility,gaussian_ff, chebyshev_vec_next); - //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, gaussian_ff, 1, &zero, chebyshev_vec_next , 1)); - alpha=-(lambda_min+lambda_max)/lambda_minus; - cublasCall(cublasRaxpy( cublas, size, &alpha, chebyshev_vec_curr, 1, chebyshev_vec_next, 1 )); - std::swap(chebyshev_vec_curr,chebyshev_vec_next); - std::swap(chebyshev_vec_last,chebyshev_vec_next); - cublasCall(cublasRaxpy( cublas, size, cheby_coefficents+1, chebyshev_vec_curr, 1, brownian_force_ff, 1 )); - for (int i=2;i<=N_chebyshev;i++){ - alpha=4./lambda_minus; - sd_Rgemv(&alpha, mobility, chebyshev_vec_curr, chebyshev_vec_next); - //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, chebyshev_vec_curr, 1, &zero, chebyshev_vec_next , 1)); - alpha=-2*(lambda_min+lambda_max)/lambda_minus; - cublasCall(cublasRaxpy( cublas, size, &alpha, chebyshev_vec_curr, 1, chebyshev_vec_next, 1 )); - alpha=-1; - cublasCall(cublasRaxpy( cublas, size, &alpha, chebyshev_vec_last, 1, chebyshev_vec_next, 1 )); - std::swap(chebyshev_vec_curr,chebyshev_vec_next); - std::swap(chebyshev_vec_last,chebyshev_vec_next); - cublasCall(cublasRaxpy( cublas, size, cheby_coefficents+i, chebyshev_vec_curr, 1, brownian_force_ff, 1 )); - } -#ifdef SD_DEBUG - assert(isSymmetricDev(mobility.data,mobility.ldd,mobility.size)); -#endif - // errorcheck of chebyshev polynomial -#ifdef SD_FF_ONLY - real dispdisp; - cublasCall(cublasRdot(cublas, size, brownian_force_ff, 1,brownian_force_ff, 1, &dispdisp)); - real E_cheby = sqrt(abs(dispdisp-gMg)/gMg); -#else - real zMz; - alpha = 1; - sd_Rgemv(&alpha, mobility, brownian_force_ff, chebyshev_vec_last); - //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, brownian_force_ff, 1, &zero, chebyshev_vec_last, 1)); - cublasCall(cublasRdot(cublas, size, chebyshev_vec_last, 1, brownian_force_ff, 1, &zMz)); - real E_cheby = sqrt(abs(zMz-gaussian_ff_norm*gaussian_ff_norm))/gaussian_ff_norm; -#endif - if (E_cheby > tol){ - recalc_ew=true; - } - cuda_safe_mem(cudaFree((void*)chebyshev_vec_last)); - cuda_safe_mem(cudaFree((void*)chebyshev_vec_curr)); - cuda_safe_mem(cudaFree((void*)chebyshev_vec_next)); - return E_cheby; -} - -void sd_Rgemv(const real * factor, const matrix & A, const real * in, real * out){ - A.assert_all(); - if(A.is_sparse){ - int threads=A.size; - const int tpB = numThreadsPerBlock; - int blocks = ((threads+tpB-1)/tpB); - sd_multiply_sparse_matrix_vector<<>>(A.size, *factor, A.data, A.ldd, A.ldd_short, A.col_idx, A.row_l, in, out ); - cudaThreadSynchronize(); - real tmp; - cublasCall(cublasRnrm2(cublas, A.size, out,1, &tmp)); - cudaCheckError("Sparse Matrix Vector Multiplication"); - } - else{ - assert(A.ldd); - const real zero=0; - #ifdef SD_USE_FLOAT - cublasCall(cublasSgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.data, A.ldd, in, 1, &zero, out, 1)); - #else - cublasCall(cublasDgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.data, A.ldd, in, 1, &zero, out, 1)); - #endif - } - if(A.wavespace != NULL){ - /*real insize; - cublasCall(cublasRnrm2(cublas, A.size, in, 1, &insize)); - real before; - cublasCall(cublasRnrm2(cublas, A.size, out, 1, &before));*/ - wavepart & wave=* A.wavespace; - int buf_size = A.wavespace->max*3; - real * sin_sum=NULL; - cuda_safe_mem(cudaMalloc( (void**)& sin_sum, buf_size*sizeof(real))); - real max; - sd_nrm1<<<1,numThreadsPerBlock>>>(A.size,in,sin_sum); - cuda_safe_mem(cudaMemcpy(&max, sin_sum, sizeof(real), cudaMemcpyDeviceToHost)); - sd_set_zero<<<64,192>>>(sin_sum,buf_size); - //fprintf(stderr,"buf_size is %d\n",buf_size); - real * cos_sum=NULL; - cuda_safe_mem(cudaMalloc( (void**)& cos_sum, buf_size*sizeof(real))); - sd_set_zero<<<64,192>>>(cos_sum,buf_size); - cudaThreadSynchronize(); - cudaCheckError(""); - // thread number has to be a power of two! - //sd_wavepart_sum<<>>(in, wave.vecs, wave.num, wave.matrices, wave.sines, wave.cosines, \ - //fprintf(stderr,"calling sd_wavepart_sum<<<%d,%d,%d>>>\n",wave.num,8,8*3*sizeof(real)); - // ,------ has to be power of two and at least 8 (and there might be still a bug if larger than 32) - // \|/ - // v - sd_wavepart_sum<<>>(in, wave.vecs, wave.num, wave.matrices, wave.sines, wave.cosines, \ - A.ldd_short, A.size/3, sin_sum, cos_sum, max); - cudaThreadSynchronize(); - cudaCheckError(""); - int tpB = 32; - int blocks = (A.ldd_short)/tpB; - sd_wavepart_assemble<<>>(wave.num, wave.sines, wave.cosines, sin_sum, cos_sum, - A.ldd_short, out, max, A.size/3, *factor); - //sd_wavepart_assemble_block<<>>(wave.num, wave.sines, wave.cosines, sin_sum, cos_sum, - // A.ldd_short, out, max, A.size/3, *factor); - cudaThreadSynchronize(); - cudaCheckError(""); - /*#ifdef SD_DEBUG - if (A.dense){ - printVectorDev(sin_sum,buf_size,"insin_sum"); - printVectorDev(cos_sum,buf_size,"incos_sum"); - real * correct; - cuda_safe_mem(cudaMalloc( (void**)&correct, A.ldd*sizeof(real) )); - const real zero=0; - #ifdef SD_USE_FLOAT - cublasCall(cublasSgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.dense, A.ldd, in, 1, &zero, correct, 1)); - #else - cublasCall(cublasDgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.dense, A.ldd, in, 1, &zero, correct, 1)); - #endif - const real minusone = -1; - cublasCall(cublasRaxpy(cublas, A.size, &minusone, out, 1 ,correct, 1)); - real erg; - cublasCall(cublasRnrm2(cublas, A.size, correct, 1 , &erg)); - if (erg < 1e-3){ - std::cout << "s"; - printVectorDev(in ,A.size,"\nin, worked "); - printVectorDev(correct,A.size,"correct "); - printVectorDev(out ,A.size,"out "); - } else { - A.print(); - cublasCall(cublasRaxpy(cublas, A.size, &minusone, out, 1 ,correct, 1)); - printVectorDev(in ,A.size,"in, failed "); - printVectorDev(correct,A.size,"correct "); - printVectorDev(out ,A.size,"out "); - - } - cuda_safe_mem(cudaFree((void*) correct)); - } - if (warnings > 20){ - printVectorDev(sin_sum,buf_size,"sin_sum"); - printVectorDev(cos_sum,buf_size,"cos_sum"); - } - #endif*/ - cuda_safe_mem(cudaFree((void*) sin_sum)); - cuda_safe_mem(cudaFree((void*) cos_sum)); - } -} - - -void sd_compute_mobility_matrix_wave_cpu(int kmax, real kmax2, matrix & mobility, real a,const real * L_h, real xi, real selfmobility, real ew_prec){ -#ifdef SD_DEBUG - //kmax=1; - //kmax2=kmax*kmax; -#endif - cudaCheckError(""); - real fak=selfmobility; - for(int k=0;k<3;k++){ - fak/=L_h[k]; - } - //std::cerr < 0 ){ - vc++; - } - } - } - } - if (!vc){ // nothing to do - return; - } - if (mobility.wavespace == NULL){ - mobility.wavespace = new wavepart(mobility.ldd_short); - //mobility.wavespace = (wavepart *) Utils::malloc(sizeof(wavepart)); - //mobility.wavespace->wavepart(); - } - mobility.wavespace->num = vc; - int max = vc; - if (max != mobility.wavespace->num){ - mobility.wavespace->num=max; - } - assert(max); - if (max > mobility.wavespace->max){ - mobility.wavespace->max = ((max+31)/32)*32; - max=mobility.wavespace->max; - //fprintf(stderr,"there are %d wavespace contribution taking %d places",vc, max); - cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->vecs) , max*3*sizeof(real) )); - cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->matrices) , max*6*sizeof(real) )); - cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->sines) , max*mobility.ldd_short*sizeof(real) )); - cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->cosines) , max*mobility.ldd_short*sizeof(real) )); - cudaCheckError(""); - sd_set_zero<<<640,64>>>(mobility.wavespace->sines , max*mobility.ldd_short); - sd_set_zero<<<640,64>>>(mobility.wavespace->cosines , max*mobility.ldd_short); - cudaCheckError(""); - } - int ldd = mobility.wavespace->max; - real vecs_h[3 * ldd]; - real matrices_h[6 * ldd]; - vc=0; - for (cki[0]=-kmax;cki[0] 0){ - k[2]=cki[2]*ki[2]; - real k2=k[0]*k[0]+ k[1]*k[1] + k[2]*k[2]; - for (int i = 0;i<3;i++){ - vecs_h[vc*3+i]=k[i]; - } - real scal = (a-1./3.*a*a*a*k2)*(1+0.25*xi2*k2+0.125*xi2*xi2*k2*k2)*6.*M_PI/k2*exp(-0.25*xi2*k2)*fak; - //if (cki[0]==-kmax){ - // printf("scal is %e, exp is %e\n",scal/fak,exp(-0.25*xi2*k2)); - //} - //if (scal/kvol > 1e-6*ew_prec){ - for (int i = 0 ; i < 3 ; i++ ){ - matrices_h[vc*6+i] = (1-(k[i]*k[i]/k2))*scal; - } - matrices_h[vc*6+3] = (-k[0]*k[1]/k2)*scal; - matrices_h[vc*6+4] = (-k[0]*k[2]/k2)*scal; - matrices_h[vc*6+5] = (-k[1]*k[2]/k2)*scal; - /*if (scal/fak > 1e-5){ - printf("scal is %e, k is %e",scal/fak,sqrt(k2)); - printf("\tk=[%4.0f, %4.0f, %4.0f] ",k[0]*L_h[0]/2/M_PI,k[1]*L_h[1]/2/M_PI,k[2]*L_h[2]/2/M_PI); - printf(" m=[%12.4e, %12.4e, %12.4e,%12.4e, %12.4e, %12.4e] \n",matrices_h[vc*6],matrices_h[vc*6+1],matrices_h[vc*6+2],matrices_h[vc*6+3],matrices_h[vc*6+4],matrices_h[vc*6+5]); - }*/ - vc++; - //} - } - } - } - } - mobility.wavespace->num = vc; - { - static int printed=0; - if ((printed%reprint)==1){ - printf("\nwe need %d k vectors\n",vc); - } - printed++; - } - cudaCheckError(""); - //fprintf(stderr,"h: 0x%x d: 0x%x, size: %d",vecs_h,mobility.wavespace->vecs, ldd*3*sizeof(real)); - cuda_safe_mem(cudaMemcpy(mobility.wavespace->vecs, vecs_h, ldd*3*sizeof(real),cudaMemcpyHostToDevice)); - cuda_safe_mem(cudaMemcpy(mobility.wavespace->matrices,matrices_h,ldd*6*sizeof(real),cudaMemcpyHostToDevice)); - //mobility.wavespace->hash_vecs(); -} - -void _cuda_check_error(char *file, unsigned int line){ - cudaError_t err=cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "Error found during memory operation. Possibly however from an failed operation before. %s:%u.\n", file, line); - printf("CUDA error: %s\n", cudaGetErrorString(err)); - if ( err == cudaErrorInvalidValue ) - fprintf(stderr, "You may have tried to allocate zero memory before %s:%u.\n", file, line); - exit(EXIT_FAILURE); - } -} - - - -int sd_find_max(int * data, int length){ - int tlength=length; - int * tdata=data; - if (tlength >= 512){ - int bs=32; - if (tlength > 4*1024){ - bs=128; - } else if (tlength > 1024){ - bs=64; - } - tlength = (length+bs-1)/bs; - cuda_safe_mem(cudaMalloc((void **) &tdata,tlength*sizeof(int))); - sd_nrm_inf<<>>(length,data,tdata); - } - //if (length < 64){ // copy to host and compare - int tmp[tlength]; - cuda_safe_mem(cudaMemcpy(tmp,tdata,tlength*sizeof(int),cudaMemcpyDeviceToHost)); - int result=tmp[0]; - for (int i=1;i result){ - result=tmp[i]; - } - } - if (length>=512) { - cuda_safe_mem(cudaFree((void*)tdata)); - } - return result; -} - - - -#endif diff --git a/src/core/integrate_sd_cuda.hpp b/src/core/integrate_sd_cuda.hpp deleted file mode 100644 index 115ce5437e8..00000000000 --- a/src/core/integrate_sd_cuda.hpp +++ /dev/null @@ -1,251 +0,0 @@ -/* - Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef __INTEGRATE_SD_CUDA_HPP -#define __INTEGRATE_SD_CUDA_HPP - -#include "integrate_sd.hpp" - -#ifdef SD -#ifdef CUDA -#include - -void errexit(); - -#ifndef SD_USE_FLOAT -typedef double real; -#define cublasRgemm(...) cublasDgemm( __VA_ARGS__) -#define cublasRdot(...) cublasDdot( __VA_ARGS__) -//#define cublasRgemv(...) cublasDgemv( __VA_ARGS__) // always use sd_Rgemv -#define cublasRcopy(...) cublasDcopy( __VA_ARGS__) -#define cublasRaxpy(...) cublasDaxpy( __VA_ARGS__) -#define cublasRscal(...) cublasDscal( __VA_ARGS__) -//#define cublasRnrm2(...) cublasDnrm2( __VA_ARGS__) -#define cublasRnrm2(a1, a2, a3, a4, a5) cublasDdot(a1,a2, a3, a4, a3, a4, a5));*(a5)=sqrt(*(a5) -#define cublasRtrsv(...) cublasDtrsv( __VA_ARGS__) -#define curandGenerateNormalReal(...) curandGenerateNormalDouble(__VA_ARGS__) -#define __real2int_rd(...) __double2int_rd(__VA_ARGS__) -#define __real2int_ru(...) __double2int_ru(__VA_ARGS__) -#define rnaupd(...) dnaupd_(__VA_ARGS__) -#else // use float -typedef float real; -#define cublasRgemm(...) cublasSgemm( __VA_ARGS__) -#define cublasRdot(...) cublasSdot( __VA_ARGS__) -//#define cublasRgemv(...) cublasSgemv( __VA_ARGS__) // always use sd_Rgemv -#define cublasRcopy(...) cublasScopy( __VA_ARGS__) -#define cublasRaxpy(...) cublasSaxpy( __VA_ARGS__) -#define cublasRscal(...) cublasSscal( __VA_ARGS__) -//#define cublasRnrm2(...) cublasSnrm2( __VA_ARGS__) -#define cublasRnrm2(a1, a2, a3, a4, a5) cublasSdot(a1,a2, a3, a4, a3, a4, a5));*(a5)=sqrt(*(a5) -#define cublasRtrsv(...) cublasStrsv( __VA_ARGS__) -#define curandGenerateNormalReal(...) curandGenerateNormal(__VA_ARGS__) -#define __real2int_rd(...) __float2int_rd(__VA_ARGS__) -#define __real2int_ru(...) __float2int_ru(__VA_ARGS__) -#define rnaupd(...) snaupd_(__VA_ARGS__) -#endif //#ifndef SD_USE_FLOAT - -#include "cuda_utils.hpp" -#include "integrate_sd.hpp" -#include "integrate_sd_cuda.hpp" -#include "integrate_sd_matrix.hpp" - -extern cublasHandle_t cublas; - -extern double temperature; ///< this is defined in \file thermostat.cpp - -const int numThreadsPerBlock = 32; ///< gives the number of threads per cuda block. should be a multiple of 32 and much smaller then the numbers of particles -#define numThreadsPerBlock_is_power_of_two TRUE - -#ifndef SD_FF_ONLY -const int bucket_size_factor = 3; //TODO: find optimal parameter for given implementation -#endif - -void _cudaCheckError(const char *msg, const char * file, const int line); -#define cudaCheckError(msg) cudaThreadSynchronize();_cudaCheckError((msg),__FILE__,__LINE__) - -#define myindex(i,j) ((i)*(lda)+(j)) - -#define SQR(x) (x)*(x) - -#define cublasCall(call) { cublasStatus_t stat=(call); \ - if (stat != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "CUBLAS failed in " << __FILE__ \ - << " l. " << __LINE__ <<"\n\t" ; \ - if (stat == CUBLAS_STATUS_NOT_INITIALIZED){ \ - std::cerr << "the CUDA Runtime initialization failed.\n"; \ - } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { \ - std::cerr << "the resources could not be allocated\n"; \ - } else { \ - std::cerr << "unknown error\n"; \ - } \ - errexit(); \ - } \ - } -#define curandCall(call) { curandStatus_t stat =(call); \ - if (stat != CURAND_STATUS_SUCCESS) { \ - std::cerr << "CURAND failed in " << __FILE__ \ - << " l. " << __LINE__ <<"\n\t" ; \ - if (stat == CURAND_STATUS_NOT_INITIALIZED){ \ - std::cerr << "the CUDA Runtime initialization failed.\n"; \ - } else if (stat == CURAND_STATUS_VERSION_MISMATCH) { \ - std::cerr << "Header file and linked library version do not match.\n"; \ - } else if (stat == CURAND_STATUS_ALLOCATION_FAILED) { \ - std::cerr << "Memory allocation failed.\n"; \ - } else if (stat == CURAND_STATUS_TYPE_ERROR) { \ - std::cerr << "Generator is wrong type.\n"; \ - } else if (stat == CURAND_STATUS_OUT_OF_RANGE) { \ - std::cerr << "the argument was out of range.\n"; \ - } else if (stat == CURAND_STATUS_LENGTH_NOT_MULTIPLE) { \ - std::cerr << "Length requested is not a multple of dimension.\n"; \ - } else if (stat == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED) { \ - std::cerr << "GPU does not have double precision required by MRG32k3a.\n"; \ - } else if (stat == CURAND_STATUS_LAUNCH_FAILURE) { \ - std::cerr << "Kernel launch failure.\n"; \ - } else if (stat == CURAND_STATUS_PREEXISTING_FAILURE) { \ - std::cerr << "Preexisting failure on library entry.\n"; \ - } else if (stat == CURAND_STATUS_INITIALIZATION_FAILED) { \ - std::cerr << "Initialization of CUDA failed.\n"; \ - } else if (stat == CURAND_STATUS_ARCH_MISMATCH) { \ - std::cerr << "Architecture mismatch, GPU does not support requested feature.\n"; \ - } else if (stat == CURAND_STATUS_INTERNAL_ERROR) { \ - std::cerr << "Internal library error.\n";\ - } else { \ - fprintf(stderr,"unknown error (error number: %d)\n",stat); \ - } \ - errexit(); \ - } \ - } - -// headers for ARPACK-NG: http://forge.scilab.org/index.php/p/arpack-ng/ -extern "C" -{ - void dnaupd_(int* IDO, char* BMAT, int* N, char WHICH[], int* NEV, double* TOL, double RESID[], int* NCV, double V[], int* LDV, int IPARAM[], - int IPNTR[], double WORKD[], double WORKL[], int* LWORKL, int* INFO); - void snaupd_(int* IDO, char* BMAT, int* N, char WHICH[], int* NEV, float* TOL, float RESID[], int* NCV, float V[], int* LDV, int IPARAM[], - int IPNTR[], float WORKD[], float WORKL[], int* LWORKL, int* INFO); -} - -#define SD_RESISTANCE_CORRECT - -/* ************************************* * - * ******* private functions ******* * - * ************************************* */ -void sd_compute_displacement(const real * r_d, int N, real viscosity, real radius, const real * L_d, const real * L_h, - const real * total_mobility_d, const real * force_d, real * disp_d, int * myInfo); - - -// this solves iteratively using CG -// disp * (1+resistance*mobility) = mobility_d * force_d -// and returnes disp -// mobility and resistance are square matrizes with size and lda <((size+31)/32)*32> -// force and disp are vectors of size -// recalc whether to mobility has changed since the last call or not -real sd_iterative_solver(const matrix & mobility, const matrix & resistance, const real * force, real * disp, real rel_err, real abs_err, bool recalc); - -// description below -// in the case of FF_ONLY this computes directly the displacement -real sd_compute_brownian_force_farfield(const matrix & mobility, const real * gaussian_ff, - real tol, real * brownian_force_ff ); - - -/// This computes the k-vectors and the wavespace mobility matrices as defined be Beenakker 1986 -void sd_compute_mobility_matrix_wave_cpu(int kmax, real kmax2, matrix & mobility, real a, const real * L_h, real xi, real selfmobility, real ew_prec); - - - - -void _cuda_check_error(char *file, unsigned int line); -#define cuda_check_error(); _cuda_check_error(__FILE__,__LINE__); -// BICGSTAB-Solver -// implimented as given in `Numerik linearer Gleichungssysteme` by Prof. Dr. Andreas Meister -// additional a preconditioner can be used (compile-time-switch). This overwrites the soluten vector b -// this solves A*x=b -// cublas a handle for cublas -// size the size n of the matrix -// A the given n*n matrix (in) -// lda the leading demension of A -// b the given solution vector (in) -// tol requested tolerance of the solution -// maxit maximum number of iterations -// x the requested solution with an initial guess (in/out) -// returns 0 on success, else error code -/*#ifdef SD_PC -int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A, int lda, - real * b, real tol,real abs_tol, int maxit, real * x, real * res); -#else -int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A1, const real * A2, int lda, - real * b, real tol,real abs_tol, int maxit, real * x, real * res); - #endif*/ - -/// GMRes-Solver -// implimented as given in Numerik linearer Gleichungssysteme by Prof. Dr. Andreas Meister -// additional a preconditioner can be used (compile-time-switch). This overwrites the soluten vector b -// this solves A*x=b -// cublas a handle for cublas -// size the size n of the matrix -// A the given n*n matrix (in) (to use with preconditioning) -// A1,A2 the matrizes that give A=A1*A2+1 (to use without preconditioning) -// lda the leading demension of A -// b the given solution vector (in) -// tol requested tolerance of the solution -// maxit maximum number of iterations -// x the requested solution with an initial guess (in/out) -// returns 0 on success, else error code -#ifdef SD_PC -int sd_gmres_solver(int size, const real * A, int lda, - real * b, real tol,real abs_tol, int maxit, real * x, real * res); -#else -int sd_gmres_solver(const matrix & A1, const matrix & A2, - const real * b, real tol,real abs_tol, int maxit, real * x, real * res); -#endif - - -// calculates the largest and snalles eigenvalue of the matrix -// mobility : the mobility matrix (data on the device) (IN) -// lambda_min : smalles eigenvalue (OUT) -// lambda_max : largest eigenvalue (OUT) -void calculate_maxmin_eigenvalues(const matrix & mobility, real * lamba_min, real * lambda_max, real tol); - -/// matrix vector multiplication -/// this solves \[ out = factor * A * in \] -/// where A is a matrix (see struct matrix) -/// in and out are vectors -/// factor is a scalar -void sd_Rgemv(const real * factor,const matrix & A,const real * in, real * out); - - -// compute the chebyshev coeffeicents which are needed to achieve a given accuracy. -// lambda_min : the lower boundery -// lambda_max : the upper boundery of the interval -// tol : the given tollerance which should be achieved -// coefficents : the pointer where the data will be stored -real calculate_chebyshev_coefficents(real lambda_min, real lambda_max, real tol,real ** coefficents); - -// find the maximal value of an array on the GPU -// data : the array -// length : number of elements -// return value : the maximal value -int sd_find_max(int * data, int length); - -#endif /* SD */ -#endif /* CUDA */ - -#endif diff --git a/src/core/integrate_sd_cuda_debug.cu b/src/core/integrate_sd_cuda_debug.cu deleted file mode 100644 index 678633130ae..00000000000 --- a/src/core/integrate_sd_cuda_debug.cu +++ /dev/null @@ -1,337 +0,0 @@ -/* - Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "config.hpp" - -#include -#include -#include -#include -#include - -#include "integrate_sd_cuda.hpp" -#include "integrate_sd_cuda_debug.hpp" -#include "integrate_sd_cuda_kernel.hpp" -//#include "errorhandling.hpp" -#include "cuda_utils.hpp" - -#if defined(OMPI_MPI_H) || defined(_MPI_H) -#error CU-file includes mpi.h! This should not happen! -#endif - - -/* *************************************************************************************************************** * - * ******************************************** DEBUGING & TESTs ******************************************** * - * *************************************************************************************************************** */ -#ifdef SD - -int sd_test(int size, int type){ - assert(size%3 == 0); - int lda=((size+31)/32)*32; - matrix mat; - mat.ldd=lda; - mat.size=size; - real * x1=NULL, * x2=NULL, * b =NULL; - cuda_safe_mem(cudaMalloc((void**) &mat.data ,size*lda*sizeof(real))); - cuda_safe_mem(cudaMalloc((void**) &mat.col_idx,size*lda*sizeof(int))); - cuda_safe_mem(cudaMalloc((void**) &mat.row_l ,size*sizeof(int))); - cuda_safe_mem(cudaMalloc((void**) &x1 ,size*sizeof(real))); - cuda_safe_mem(cudaMalloc((void**) &x2 ,size*sizeof(real))); - cuda_safe_mem(cudaMalloc((void**) &b ,size*sizeof(real))); - sd_set_zero_matrix<<<32,32>>>(mat.data,mat.size,mat.ldd); - sd_set_zero<<<32,32>>>(b,size); - for (int i=0;i>>(mat.col_idx+i*mat.ldd,lda,i); - } - sd_set_value<<<32,32>>>(mat.row_l,mat.ldd, mat.size/3); - //cublasHandle_t cublas=NULL; // use global - cublasCall(cublasCreate(&cublas)); - curandGenerator_t generator = NULL; - curandCall(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - curandCall(curandSetPseudoRandomGeneratorSeed(generator, (unsigned long long)('E'+'S'+'P'+'R'+'e'+'s'+'S'+'o'))); - curandCall(curandSetGeneratorOrdering( generator, CURAND_ORDERING_PSEUDO_BEST)); - curandCall(curandSetGeneratorOffset( generator, 0)); - curandCall(curandGenerateNormalReal( generator, b, size, 0, sqrt(2.))); - curandCall(curandGenerateNormalReal( generator, x1, size, 0, sqrt(2.))); - curandCall(curandGenerateNormalReal( generator, x2, size, 0, sqrt(2.))); - if (type==0){ - ; - } - else{ - curandCall(curandGenerateNormalReal( generator, mat.data, mat.size*mat.ldd, 0, sqrt(2.))); - } - mat.is_sparse=false; - const real one=1; - const real minusone=-1; - sd_Rgemv(&one, mat, b,x1); - mat.is_sparse=true; - sd_Rgemv(&one, mat, b,x2); - real tmp; - cublasCall(cublasRnrm2(cublas, size, x1, 1, &tmp)); - std::cout << "x1 norm: " << tmp; - cublasCall(cublasRnrm2(cublas, size, x2, 1, &tmp)); - std::cout << " x2 norm: " << tmp; - cublasCall(cublasRaxpy(cublas, size, &minusone, x1,1,x2,1)); - cublasCall(cublasRnrm2(cublas, size, x2, 1, &tmp)); - std::cout << " diff norm: " << tmp; - return 0; -} - - -void printVectorDev( const real * data, const int m, const char * msg){ - std::cout << msg; - printVectorDev(data,m); -} -void printVectorDev( const real * data, const int m){ - real host[m]; - assert(m>0); - cuda_safe_mem(cudaMemcpy( host, data, m*sizeof(*data), cudaMemcpyDeviceToHost )); - int count = 1; - char komma[2]={0,0}; - std::cout << " = ["; - for (int j=0; j < m;j++){ - bool print=true; - //std::cout << ",\t" << std::setw(20) << std::setprecision(15) << host[j]; - if (j < m-1){ - if (host[j]==host[j+1]){ - count++; - print=false; - } - } - if (print){ - if (count > 3){ - printf("%s\t%12.4e ",komma,host[j],count); - komma[0]=','; - count = 1; - } else { - for (; count > 0 ; count--){ - printf("%s\t%12.4e",komma,host[j]); - komma[0]=','; - } - count = 1; - } - } - } - std::cout << "];\n"; -} - - -void printVectorHost( const real * host, const int m, const char * msg){ - std::cout << msg; - printVectorHost(host,m); -} - -void printVectorHost( const real * host, const int m){ - for (int j=0; j < m;j++){ - std::cout << ",\t" << std::setw(20) << std::setprecision(15) << host[j]; - } - std::cout << "\n"; -} - -void printPosDev( real * data, int m, const char * msg){ - std::cout << msg; - real host[m*3]; - cuda_safe_mem(cudaMemcpy( host, data,3*m*sizeof(*data), cudaMemcpyDeviceToHost )); - for (int l=0; lmax){ - max=abs(data[i]); - } - } - return max; -} - - -real getAbsMaxDev(const real * data_d, int length){ - real host[length]; - cuda_safe_mem(cudaMemcpy( host, data_d, length*sizeof(*data_d), cudaMemcpyDeviceToHost )); - return getAbsMax(host,length); -} - - -int countMinusDev(const real * data_d, int length){ - real host[length]; - cuda_safe_mem(cudaMemcpy( host, data_d, length*sizeof(*data_d), cudaMemcpyDeviceToHost )); - int c=0; - for (int i=0;i 1e-8){ - fprintf(stderr,"not symmetric: elements (%d,%d) and (%d,%d)\n",i,j,j,i); - return false; - } - } - } - #endif - return true; -} - - - -void printMatrixDev( const matrix & mat){ - /*if (mat.wavespace){ - printf("wavespace:: max entries: %d, actual entries: %d\n",mat.wavespace->max,mat.wavespace->num); - printVectorDev(mat.wavespace->matrices,mat.wavespace->max*6, "matrices"); - printVectorDev(mat.wavespace->vecs ,mat.wavespace->max*3, "vecs"); - }*/ - mat.printWavespace(); - printMatrixDev(mat.data,mat.ldd,mat.size); -} - - -void matrix::printWavespace() const{ - if (wavespace){ - wavespace->print(size/3,ldd_short); - }else { - printf("No wavespace available!\n"); - } -} -void matrix::print() const{ - printMatrixDev(*this); -} - -void wavepart::print(int N, int ldd) const{ - printVectorDev(sines,max*ldd); - printVectorDev(cosines,max*ldd); - assert(max > 0); - real vec_h[max*3]; - real mat_h[max*6]; - real sin_h[max*ldd]; - real cos_h[max*ldd]; - //real box_l[]={8,8,8}; - extern double box_l[3]; - cuda_safe_mem(cudaMemcpy( vec_h, vecs, max*3*sizeof(real), cudaMemcpyDeviceToHost )); - cuda_safe_mem(cudaMemcpy( mat_h, matrices, max*6*sizeof(real), cudaMemcpyDeviceToHost )); - cuda_safe_mem(cudaMemcpy( sin_h, sines, max*ldd*sizeof(real), cudaMemcpyDeviceToHost )); - cuda_safe_mem(cudaMemcpy( cos_h, cosines, max*ldd*sizeof(real), cudaMemcpyDeviceToHost )); - printf("printing wavespace:\n"); - for (int i=0; i -#include -#include -#include - -#include "integrate_sd_cuda.hpp" - -#ifdef SD - -// template implementation below -template void printMatrixDev( T * data, int m, int n, const char * msg); -template void printMatrixDev( T * data, int m, int n); -void printMatrixDev( const matrix& mat); -template void printMatrixHost( T * data, int m, int n, const char * msg); - -template void printDistances( T * r_h, T, int N, T min, T max); -template void printDistances( T * r_h, T, int N); - -void printPosDev( real * data, int m, const char * msg); -void printVectorDev( const real * data, const int m); -void printVectorDev( const real * data, const int m, const char * msg); -//void printVectorDev( float * data, int m); -void printVectorHost( const real * data, const int m); -void printVectorHost( const real * data, const int m, const char * msg); -//void cudaCheckError(const char *msg); -bool hasAnyNanDev(const real * data, int length); -bool isSymmetricDev(const matrix & mat); -bool isSymmetricDev(const real * data, int lda, int size); - - -real getAbsMax(const real * data, int length); -real getAbsMaxDev(const real * data_d, int length); -int countMinusDev(const real * data_d, int length); - -class myTimer { -public: - // init numCounters counter - myTimer(unsigned int _numCounters); - ~myTimer(); - // add the time since the last call to counter _counter - void add(unsigned int _counter); - void print(unsigned int _counter,const char * msg); - void printAll(const char * msg); -private: - myTimer(const myTimer &t); // this far not implemented - do not copy by accident - unsigned int numCounters; - unsigned long long int * counters; - struct timespec last; -}; - - -/*********************************** - *** Template Implementation *** - **********************************/ -template -T mymin(T a, T b){ - return a>b?b:a; -} - -template -void printMatrixHost( T * host, int n, int m, const char * msg){ - std::cout << msg; - int maxn,maxm; - if (false){ - maxm=mymin(1000,m); - maxn=mymin(1000,n); - } else { - int sym = mymin(m,n); - maxm=mymin(1000,sym); - maxn=mymin(1000,sym); - } - if ( m<=0 || n<=0){ - return; - } - std::cout << "\nA=["; - for (int j=0; j < maxm;j++){ - std::cout <<"["; - std::cout << std::setw(12)<< host[n*j]; - for (int i =1; i -void printMatrixDev( T * data, int n, int m){ - T host[m*n]; - if ( m<=0 || n<=0){ - return; - } - else{ - cudaMemcpy( host, data, m*n*sizeof(*data), cudaMemcpyDeviceToHost ); - printMatrixHost(host,n,m,""); - } -} - - -template -void printMatrixDev( T * data, int m, int n, const char * msg){ - std::cout << msg; - printMatrixDev(data,m,n); -} - - -template void printDistances( T * r_h, T L, int N, T min, T max){ -#ifndef DIM - const int DIM=3; -#endif - for (int i = 0;i void printDistances( T * r_h, T L, int N){ - printDistances(r_h, L, N,0,L); -} - - -#endif /* SD */ - -#endif diff --git a/src/core/integrate_sd_cuda_device.cu b/src/core/integrate_sd_cuda_device.cu deleted file mode 100644 index e11366bd9f7..00000000000 --- a/src/core/integrate_sd_cuda_device.cu +++ /dev/null @@ -1,130 +0,0 @@ -/* - Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "integrate_sd_cuda.hpp" -#include "integrate_sd_cuda_device.hpp" - -#if defined(OMPI_MPI_H) || defined(_MPI_H) -#error CU-file includes mpi.h! This should not happen! -#endif - -#ifdef SD - -/* *************************************************************************************************************** * - * ******************************************** DEVICE-Functions ******************************************** * - * *************************************************************************************************************** */ - -/// atomic add for doubles -/// address : pointer to which the value should be added -/// inc : value which should be added -__device__ double atomicAdd(double * address, double inc){ - - ull *addressUll = (ull*) address; - ull oldValue=*addressUll; - ull assumedValue; - assert(!isnan(inc)); - do { - assumedValue=oldValue; - ull newValue = __double_as_longlong (__longlong_as_double(assumedValue)+inc); - oldValue = atomicCAS(addressUll,assumedValue,newValue); - } - while (oldValue != assumedValue); - return __longlong_as_double(oldValue); -} - -/// modulo function for integers as implemented in python -/// returns value % max -/// which is always positive -__device__ __inline__ int fold_back_up(int value, int max){ - while (value < 0){ - value+=max; - } - while (value >= max){ - value-=max; - } - return value; -} - -/// reduction function returning maximum of all value. -/// has to be called by all threads -/// shared_cache should be a pointer to shared memory of at least size blockDim.x -/// blockDim.x has to be an even number -__device__ int reduce_max(int * shared_cache){ - for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){ - if (threadIdx.x < t){ - if (shared_cache[threadIdx.x]shared_cache[1]?shared_cache[0]:shared_cache[1]; - } - return shared_cache[0]; -} - - -__device__ int reduce_max(int * shared_cache, int value){ - shared_cache[threadIdx.x]=value; - /*if (threadIdx.x == 0){ - if ((((blockDim.x+1)/2)*2) > blockDim.x){ - shared_cache[blockDim.x+1]=0; - } - }*/ - //shared_cache[threadIdx.x+blockDim.x]=0; - __syncthreads(); - return reduce_max(shared_cache); -} - - - -/// reduction function returning sum of all values in shared_cache[0:blockDim.x-1] -/// has to be called by all threads -/// shared_cache should be a pointer to shared memory of at least size blockDim.x -__device__ void reduce_sum(real * shared_cache){ -#ifndef numThreadsPerBlock_is_power_of_two -#error numThreadsPerBlock has to be a power of two for effective reduction -#endif - //shared_cache[threadIdx.x]=value; - for (int t=(blockDim.x)/2;t>0;t=t/2){ - if (threadIdx.x < t){ - shared_cache[threadIdx.x]=shared_cache[threadIdx.x]+shared_cache[threadIdx.x+t]; - } - __syncthreads(); - } -} - -/// function to avoid caching if reading from global memory -/// addr : address from which the value should be read -__device__ __inline__ real read_without_caching( const real * addr){ - real tmp; -#ifdef SD_USE_FLOAT - asm("ld.global.cs.f32 %0,[%1];\n" - : "=f"(tmp) : "l"(addr) : ); -#else - asm("ld.global.cs.f64 %0,[%1];\n" - : "=d"(tmp) : "l"(addr) : ); -#endif - return tmp; -} - -#endif diff --git a/src/core/integrate_sd_cuda_device.hpp b/src/core/integrate_sd_cuda_device.hpp deleted file mode 100644 index 67bb68bf772..00000000000 --- a/src/core/integrate_sd_cuda_device.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef __INTEGRATE_SD_CUDA_DEVICE_HPP -#define __INTEGRATE_SD_CUDA_DEVICE_HPP - -#include "integrate_sd_cuda.hpp" - -#ifdef SD - -typedef unsigned long long ull; -// atomicAdd implementation for double -__device__ double atomicAdd(double * address, double inc); - -__device__ int fold_back_up(int value, int max); - -__device__ int reduce_max(int * shared_cache, int value); - -__device__ __inline__ real read_without_caching( const real * addr); - -__device__ void reduce_sum(real * shared_cache); - -#endif /* SD */ - -#endif diff --git a/src/core/integrate_sd_cuda_kernel.cu b/src/core/integrate_sd_cuda_kernel.cu deleted file mode 100644 index 5977c2eecee..00000000000 --- a/src/core/integrate_sd_cuda_kernel.cu +++ /dev/null @@ -1,1884 +0,0 @@ -/* - Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "config.hpp" - -#include -#include -#include "cuda_runtime.h" -#include - -#include "integrate_sd_cuda.hpp" -#include "integrate_sd_cuda_device.cu" - -#if defined(OMPI_MPI_H) || defined(_MPI_H) -#error CU-file includes mpi.h! This should not happen! -#endif - -#ifdef SD - -/* *************************************************************************************************************** * - * ******************************************** CUDA-KERNELS ******************************************** * - * *************************************************************************************************************** */ - - -/// This computes the farfield contribution of the mobility in the case of no -/// periodic boundary conditions. -/// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...] -/// N is the number of particles -/// self_mobility is 1./(6.*PI*eta*a) -/// a is the particle radius -/// mobility is the mobility matrix which will be retruned -#define mydebug(str,...) -// if (threadIdx.x < 3 && (blockIdx.x == 0 || blockIdx.x == 1)){printf("line: %d thread: %2d, block: %2d "str,__LINE__,threadIdx.x,blockIdx.x,__VA_ARGS__);} -__global__ void sd_compute_mobility_matrix(const real * r, int N, real self_mobility, real a, real * mobility){ - real mypos[3]; - const int lda=((3*N+31)/32)*32; - __shared__ real cachedPos[3*numThreadsPerBlock]; - __shared__ real writeCache[3*numThreadsPerBlock]; - int i = blockIdx.x*blockDim.x + threadIdx.x; - // get data for myposition - using coalscaled memory access - for (int l=0;l<3;l++){ - mydebug(" 0x%08x -> 0x%08x \n",numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x,numThreadsPerBlock*l+threadIdx.x); - cachedPos[numThreadsPerBlock*l+threadIdx.x] = r[numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x]; - } - __syncthreads(); - for (int l=0;l<3;l++){ - mypos[l] = cachedPos[threadIdx.x*3+l]; - mydebug("mypos[%d]: %e\n",l,mypos[l]); - } - - /*if (i < N){ - // first write the self contribution -#pragma unroll - for (int k=0; k < DIM; k++){ - //#pragma unroll - //for (int l=0; l < DIM; l++){ - //mobility[myindex(DIM*i+k,DIM*i+l)]=0; - //} - mobility[myindex(DIM*i+k,DIM*i+k)]=self_mobility; - } - }*/ - for (int offset=0;offset cutoff){ // drn < 2*a - t=0; - t2=0; - if (i==j){ - //t2=1-6./sqrt(M_PI)*xa+40./3./sqrt(M_PI)*xa*xa*xa; -#ifdef SD_USE_FLOAT - t2=1-3.385137501286537721688476709364635515064303775f*xa + 7.52252778063675049264105935414363447792067505f*xa3; -#else - t2=1-3.385137501286537721688476709364635515064303775*xa + 7.52252778063675049264105935414363447792067505*xa3; -#endif - } - } else { - // Rotne Prager - real xr2=xr*xr; - real ar2=ar*ar; -#ifdef SD_USE_FLOAT - t=(0.75f-1.5f*ar2)*ar*erfcf(xr)+(-4.f*xa3*xr2*xr2-3.f*xa*xr2+16.f*xa3*xr2+1.5f*xa-2.f *xa3-3.f*xa*ar2)*0.5641895835477562869480794515607725858440506f*exp(-xr2); - t2=(0.75f+0.5f*ar2)*ar*erfcf(xr)+(4.f*xa3*xr2*xr2+3.f*xa*xr2-20.f*xa3*xr2-4.5f*xa+14.f*xa3+ xa*ar2)*0.5641895835477562869480794515607725858440506f*exp(-xr2); -#else - //assert(erfc(xr)>1e-15); - t=(0.75-1.5*ar2)*ar*erfc(xr); - //assert(t>1e-15); - real tmp1=(-4.*xa3*xr2*xr2-3.*xa*xr2+16.*xa3*xr2+1.5*xa-2. *xa3-3.*xa*ar2); - real tmp2=0.5641895835477562869480794515607725858440506*exp(-xr2);// - //assert(abs(tmp1)>1e-15); - //assert(abs(tmp2)>1e-15); - //assert(abs(tmp1*tmp2)>1e-15); - t+=tmp1*tmp2; - //assert(abs(t)>1e-15); - t2=(0.75+0.5*ar2)*ar*erfc(xr)+(4.*xa3*xr2*xr2+3.*xa*xr2-20.*xa3*xr2-4.5*xa+14.*xa3+ xa*ar2)*0.5641895835477562869480794515607725858440506*exp(-xr2); -#endif - //assert(t>1e-15); - //assert(t2>1e-15); - } - t*=self_mobility; - t2*=self_mobility; - t/=dr2; - real tmp_el13; -#pragma unroll 3 - for (int k=0; k < DIM; k++){ - if (k ==0){ // these ifs should be removed at compile time ... after unrolling -#pragma unroll 3 - for (int l=0;l < 3; l++){ - writeCache[3*threadIdx.x+l]=dr[k]*dr[l]*t; - } - } - else if(k==1){ - tmp_el13 = writeCache[3*threadIdx.x+2]; - writeCache[3*threadIdx.x+0]=writeCache[3*threadIdx.x+1]; -#pragma unroll 2 - for (int l=1;l < DIM; l++){ - writeCache[3*threadIdx.x+l]=dr[k]*dr[l]*t; - } - } - else{ - writeCache[3*threadIdx.x+0]=tmp_el13; - writeCache[3*threadIdx.x+1]=writeCache[3*threadIdx.x+2]; - writeCache[3*threadIdx.x+2]=dr[k]*dr[2]*t; - } - writeCache[3*threadIdx.x+k]+=t2; - - __syncthreads(); - int max = min(blockDim.x,N-blockDim.x*blockIdx.x); - for (int l=0;l<3;l++){ - mobility[(DIM*j+k)*lda+blockIdx.x*blockDim.x*3+max*l+threadIdx.x]=writeCache[max*l+threadIdx.x]; - } - } - } // for (j = ... - } // if (i < N) - }// for offset = ... -} -#undef mydebug - - -/// This computes the farfield contribution of the mobility with ewald summation -/// this kernel computes the sines and cosinus. -/// r : the vector of positions [x_1, y_1, z_1, x_2, y_2, z_2, ...] -/// N : the number of particles -/// vecs * : a pointer to the k-vectors -/// num : number of k-vectors -/// sines : the values for sin(pos_i k_j) -/// cosines : the values for cos(pos_i k_j) -/// ldd : the leading dimension of the sines and cosines array -#define mydebug(str,...) -// if (threadIdx.x < 3 && (blockIdx.x == 0 || blockIdx.x == 1)){printf("line: %d thread: %2d, block: %2d "str,__LINE__,threadIdx.x,blockIdx.x,__VA_ARGS__);} -__global__ void sd_compute_mobility_sines(const real * r, int N, const real * vecs, int num, real * sines, real * cosines, int ldd){ - real mypos[3]; - __shared__ real cache[3*numThreadsPerBlock]; - int i = blockIdx.x*blockDim.x + threadIdx.x; - // get data for myposition - using coalscaled memory access - for (int l=0;l<3;l++){ - mydebug(" 0x%08x -> 0x%08x \n",numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x,numThreadsPerBlock*l+threadIdx.x); - cache[numThreadsPerBlock*l+threadIdx.x] = r[numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x]; - } - __syncthreads(); - for (int l=0;l<3;l++){ - mypos[l] = cache[threadIdx.x*3+l]; - mydebug("mypos[%d]: %e\n",l,mypos[l]); - } - - for (int offset=0;offset num){ - offset_next=num; - } - // j: wave vector index - for (int j=offset;j< offset_next ;j++){ - sine = sines [i + j*ldd]; - //sine = read_without_caching(sines + i+j*ldd); -#pragma unroll 3 - for (int k=0;k < 3;k++){ - real tmp = sin_sum[k + j*3]; - //assert(tmp <= max); - myout[k]+=sine*tmp; - } - cosine = cosines[i + j*ldd]; - //cosine = read_without_caching(cosines + i+j*ldd); -#pragma unroll 3 - for (int k=0;k= N || i ==j || j >= N){ - ; - } - else if (dr2 < 4*a*4*a){ - if (!(2*a*2*a < dr2 )){ - atomicAdd(myInfo,1); // count overlapping particles - } - else {// 2*a < drn < 4*a - interactions++; - // python code: - // # Use only singular therms, namely to order O(s_ij^0) - // T=(1./4./s-1/4-9./40.*ls)*dr*drt/dr2 - // # ^ this additonal constant is so that the mobility is smooth - // # c.f. N.-Q. Nguyen and A. J. C. Ladd, PHYSICAL REVIEW E 66, 046708 (2002) equation (34) - // T+=1./6.*ls*(-one+dr*drt/dr2) - // R[3*i:3*i+3,3*j:3*j+3]=-T - // R[3*i:3*i+3,3*i:3*i+3]+=T - real drn= sqrt(dr2); // length of dr - real s = drn/a-2; - real ls = log(s); - - real const para_fac_c=-0.125+(9./40.)*log(2.)+3./112.*2.*log(2.); -#ifdef SD_RESISTANCE_CORRECT - real para_fac =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls); - real perp_fac =((1./6.)*ls); -#else - real const perp_fac_c=1./6.*log(2.); - real para_fac =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls-para_fac_c)/dr2/self_mobility; - real perp_fac =((1./6.)*ls-perp_fac_c)/self_mobility; - diag_fac = perp_fac; - offdiag_fac = para_fac-perp_fac; -#endif -#ifdef SD_RESISTANCE_CORRECT - real dr4=dr2*dr2; - real dr6=dr4*dr2; - // constants for correction at cutoff - const real dr_c1 = 4; - const real dr_c2 = 4*4; - const real dr_c3 = 4*4*4; - const real dr_c4 = 4*4*4*4; - const real dr_c5 = 4*4*4*4*4; - const real dr_c6 = 4*4*4*4*4*4; - const real r2bcorr_para_self_c = 1./(1.-9./4./dr_c2+3./dr_c4-1./dr_c6) + para_fac_c; - const real r2bcorr_para_mix_c = (6.*dr_c5-4.*dr_c3)/(4.*dr_c6-9.*dr_c4+12.*dr_c2-4.) + para_fac_c; - const real r2bcorr_perp_self_c = 1./(1.-25./16./dr_c2) + 1./6.*log(2.); - const real r2bcorr_perp_mix_c = 1./(16./20.*dr_c1-25./20./dr_c1) + 1./6.*log(2.); - // TODO: Make sure to use (real) and not double ... - // real computation - real r2bcorr_para_self =-( para_fac + ( 1./(1.-9./4./dr2+3./dr4-1./dr6) - r2bcorr_para_self_c )); - real r2bcorr_para_mix = ( para_fac + ( (6.*dr4*drn-4.*dr2*drn)/(4.*dr6-9.*dr4+12.*dr2-4.) - r2bcorr_para_mix_c )); - real r2bcorr_perp_self =-( perp_fac + ( 1./(1.-((real)25./16.)/dr2) - r2bcorr_perp_self_c )); - real r2bcorr_perp_mix = ( perp_fac + ( 1./(16./20.*drn-25./20./drn) - r2bcorr_perp_mix_c )); - //printf("%d %d show [ %e, %e, %e, %e ]\n",i,j,r2bcorr_para_self,r2bcorr_perp_self,r2bcorr_para_mix,r2bcorr_perp_mix); - - r2bcorr_diag_self = (r2bcorr_perp_self)/self_mobility; - r2bcorr_diag_mix = (r2bcorr_perp_mix )/self_mobility; - r2bcorr_offdiag_self = (r2bcorr_para_self - r2bcorr_perp_self) /self_mobility/dr2; - r2bcorr_offdiag_mix = (r2bcorr_para_mix - r2bcorr_perp_mix ) /self_mobility/dr2; -#endif - } - } - if (i < N){ -#pragma unroll 3 - for (int k=0; k < DIM; k++){ -#pragma unroll 3 - for (int l=0;l < DIM; l++){ -#ifdef SD_RESISTANCE_CORRECT - resistance[myindex(DIM*i+k,DIM*j+l)]=dr[k]*dr[l]*r2bcorr_offdiag_mix; -#else - resistance[myindex(DIM*i+k,DIM*j+l)]=dr[k]*dr[l]*offdiag_fac; -#endif - } -#ifdef SD_RESISTANCE_CORRECT - myresistance[k]-=dr[k]*dr[k]*r2bcorr_offdiag_self; - resistance[myindex(DIM*i+k,DIM*j+k)]+=r2bcorr_diag_mix; - myresistance[k]-=r2bcorr_diag_self; -#else - myresistance[k]-=dr[k]*dr[k]*offdiag_fac; - resistance[myindex(DIM*i+k,DIM*j+k)]+=diag_fac; - myresistance[k]-=diag_fac; -#endif - } - } -#ifdef SD_RESISTANCE_CORRECT - myresistance[3]-=r2bcorr_offdiag_self*dr[0]*dr[1]; - myresistance[4]-=r2bcorr_offdiag_self*dr[0]*dr[2]; - myresistance[5]-=r2bcorr_offdiag_self*dr[1]*dr[2]; -#else - myresistance[3]-=offdiag_fac*dr[0]*dr[1]; - myresistance[4]-=offdiag_fac*dr[0]*dr[2]; - myresistance[5]-=offdiag_fac*dr[1]*dr[2]; -#endif - // python implementation: - //T=one*(1-9./32.*drn/a)+3./32.*dr*drt/drn/a; - } - } - if ( i < N){ -#pragma unroll - for (int k=0;k<3;k++){ - resistance[myindex(DIM*i+k,DIM*i+k)]=myresistance[k]; - } - resistance[myindex(DIM*i+0,DIM*i+1)]=myresistance[3]; - resistance[myindex(DIM*i+1,DIM*i+0)]=myresistance[3]; - resistance[myindex(DIM*i+0,DIM*i+2)]=myresistance[4]; - resistance[myindex(DIM*i+2,DIM*i+0)]=myresistance[4]; - resistance[myindex(DIM*i+1,DIM*i+2)]=myresistance[5]; - resistance[myindex(DIM*i+2,DIM*i+1)]=myresistance[5]; - } - __syncthreads(); - int * sharedInteractions = (int *) cachedPos; // reuse shared memory - int * maxInteractions = sharedInteractions + blockDim.x*2; - sharedInteractions[threadIdx.x]=interactions; - sharedInteractions[threadIdx.x+blockDim.x]=0; - maxInteractions[threadIdx.x] =interactions; - maxInteractions[threadIdx.x+blockDim.x] =0; - for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){ - if (threadIdx.x < t){ - sharedInteractions[threadIdx.x]+=sharedInteractions[threadIdx.x+t]; - sharedInteractions[threadIdx.x+t]=0; - maxInteractions[threadIdx.x]=max(maxInteractions[threadIdx.x+t],maxInteractions[threadIdx.x]); - } - __syncthreads(); - } - if (threadIdx.x==0){ - sharedInteractions[0]+=sharedInteractions[1]; - atomicAdd(myInfo+1, sharedInteractions[0]); - maxInteractions[0]=max(maxInteractions[0],maxInteractions[1]); - atomicMax(myInfo+2, maxInteractions[0]); - } -} - -/// Find particle pairs which are within a certain cutoff range -/// pos : is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...] -/// L_g : is the boxlength -/// N : is the number of particles -/// interacting : list of interaction particles. First entry is the first -/// particle close enough to particle 1. Followed by the first -/// one close enough to the second and so on. -/// The second interaction particle of the first is at -/// lda_short=((N+31)/32)*32 stored. -/// num_interacting : for each particle the number of interacting particles. -/// interaction_range : the cutoff range for interaction. Has to be smaller -/// than half the boxlength. -__global__ void sd_find_interacting_particles(const real * pos,const real * L_g, const int N, int * interacting, int * num_interacting, - const real interaction_range){ - const int lda_short=((N+31)/32)*32; - const real interaction_range_squared=SQR(interaction_range); - __shared__ real cachedPos[3*numThreadsPerBlock]; -#ifndef SD_NOT_PERIODIC - __shared__ real L[3]; - if (threadIdx.x < 3){ - L[threadIdx.x]=L_g[threadIdx.x]; - } -#endif - int i = blockIdx.x*blockDim.x + threadIdx.x; - for (int l=0;l<3;l++){ - cachedPos[threadIdx.x+l*blockDim.x] = pos[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3]; - } - __syncthreads(); - real mypos[3]; - for (int l=0;l<3;l++){ - mypos[l]=cachedPos[threadIdx.x*3+l]; - } - if (i < N){ - int interactions=0; - int j0next; - for (int j0=0;j0=0 && current_bucket < total_bucket_num)); - int in_loop=particle_count[current_bucket]; - for (int j_loop=0;j_loop=interactions){ - interacting[i+j*lda_short]=N+1; - } - } - bool swapped; - do { - swapped=false; - int last=interacting[i]; - int cur; - for (int j=1;j cur){ - //swap - interacting[i+lda_short*(j-1)]=cur; - interacting[i+lda_short*j] =last; - swapped=true; - } else { - last=cur; - } - } - loop--; - } while (swapped); - }*/ - if ( i < N){ - int loop = interactions; - do { - int last=interacting[i]; - int cur; - int last_j=0; - for (int j=1;j cur){ - //swap - interacting[i+lda_short*(j-1)]=cur; - interacting[i+lda_short*j] =last; - last_j=j; - } else { - last=cur; - } - } - loop=last_j; - } while (loop>0); - } - -} - -/// Compute the lubrication correction in the resistance formulation. -/// pos : is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...] -/// N : is the number of particles -/// self_mobility : is 1./(6.*PI*eta*a) -/// a : is the particle radius -/// _L : is the boxlength -/// resistance : is the sparse resistance matrix which will be returned -/// col_idx : list of interacting partilces -/// row_l : number of col_idx entries per particle -/// myInfo : contains infos about the operation: -/// myInfo[0] : number of overlapping particles -/// myInfo[1] : number of interacting particles (via nf) -/// myInfo[2] : max number of interacting particles per particle -__global__ void sd_compute_resistance_matrix_sparse(const real * pos, const int N, const real self_mobility,const real a,const real * _L, - real * resistance,const int * col_idx,const int * row_l, int * myInfo){ - const int lda_short=((N+31)/32)*32; - const int lda = ((N*3+31)/32)*32; -#ifdef SD_USE_FLOAT - __shared__ real cachedPos[4*numThreadsPerBlock]; -#else - __shared__ real cachedPos[3*numThreadsPerBlock]; -#endif - // copy small vectors to shared memory -#ifndef SD_NOT_PERIODIC - __shared__ real L[3]; - if (threadIdx.x < 3){ - L[threadIdx.x]=_L[threadIdx.x]; - } -#endif - real myresistance[6]={0,0,0,0,0,0}; - int i = blockIdx.x*blockDim.x + threadIdx.x; - // get data for myposition - but coalscaled -#pragma unroll 3 - for (int l=0;l<3;l++){ - cachedPos[threadIdx.x+l*numThreadsPerBlock] = pos[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3]; - } - - __syncthreads(); - - int num_interactions=i= N || i ==j || j >= N){ - ; - } - else if (dr2 < 4*a*4*a){ - if (!(2*a*2*a < dr2 )){ - if (i==j){ - i_index=j_loop; - } else { - atomicAdd(myInfo,1); // count overlapping particles - } - } - else {// 2*a < drn < 4*a - // python code: - // # Use only singular therms, namely to order O(s_ij^0) - // T=(1./4./s-1/4-9./40.*ls)*dr*drt/dr2 - // # ^ this additonal constant is so that the mobility is smooth - // # c.f. N.-Q. Nguyen and A. J. C. Ladd, PHYSICAL REVIEW E 66, 046708 (2002) equation (34) - // T+=1./6.*ls*(-one+dr*drt/dr2) - // R[3*i:3*i+3,3*j:3*j+3]=-T - // R[3*i:3*i+3,3*i:3*i+3]+=T - real drn= sqrt(dr2); // length of dr - real s = drn/a-2; - real ls = log(s); - - real const para_fac_c=-0.125+(9./40.)*log(2.)+3./112.*2.*log(2.); -#ifdef SD_RESISTANCE_CORRECT - real para_fac =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls); - real perp_fac =((1./6.)*ls); -#else - real const perp_fac_c=1./6.*log(2.); - real para_fac =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls-para_fac_c)/dr2/self_mobility; - real perp_fac =((1./6.)*ls-perp_fac_c)/self_mobility; - diag_fac = perp_fac; - offdiag_fac = para_fac-perp_fac; -#endif -#ifdef SD_RESISTANCE_CORRECT - real dr4=dr2*dr2; - real dr6=dr4*dr2; - // constants for correction at cutoff - const real dr_c1 = 4; - const real dr_c2 = 4*4; - const real dr_c3 = 4*4*4; - const real dr_c4 = 4*4*4*4; - const real dr_c5 = 4*4*4*4*4; - const real dr_c6 = 4*4*4*4*4*4; - - const real r2bcorr_para_self_c = 1./(1.-9./4./dr_c2+3./dr_c4-1./dr_c6) + para_fac_c; - const real r2bcorr_para_mix_c = (6.*dr_c5-4.*dr_c3)/(4.*dr_c6-9.*dr_c4+12.*dr_c2-4.) + para_fac_c; - const real r2bcorr_perp_self_c = 1./(1.-25./16./dr_c2) + 1./6.*log(2.); - const real r2bcorr_perp_mix_c = 1./(16./20.*dr_c1-25./20./dr_c1) + 1./6.*log(2.); - // TODO: Make sure to use (real) and not double ... - // real computation - real r2bcorr_para_self =-( para_fac + ( 1./(1.-9./4./dr2+3./dr4-1./dr6) - r2bcorr_para_self_c )); - real r2bcorr_para_mix = ( para_fac + ( (6.*dr4*drn-4.*dr2*drn)/(4.*dr6-9.*dr4+12.*dr2-4.) - r2bcorr_para_mix_c )); - real r2bcorr_perp_self =-( perp_fac + ( 1./(1.-((real)25./16.)/dr2) - r2bcorr_perp_self_c )); - real r2bcorr_perp_mix = ( perp_fac + ( 1./(16./20.*drn-25./20./drn) - r2bcorr_perp_mix_c )); - //printf("%d %d show [ %e, %e, %e, %e ]\n",i,j,r2bcorr_para_self,r2bcorr_perp_self,r2bcorr_para_mix,r2bcorr_perp_mix); - - r2bcorr_diag_self = (r2bcorr_perp_self)/self_mobility; - r2bcorr_diag_mix = (r2bcorr_perp_mix )/self_mobility; - r2bcorr_offdiag_self = (r2bcorr_para_self - r2bcorr_perp_self) /self_mobility/dr2; - r2bcorr_offdiag_mix = (r2bcorr_para_mix - r2bcorr_perp_mix ) /self_mobility/dr2; -#endif - } - } - if (i < N){ -#pragma unroll 3 - for (int k=0; k < DIM; k++){ -#pragma unroll 3 - for (int l=0;l < DIM; l++){ -#ifdef SD_RESISTANCE_CORRECT - resistance[DIM*i+k+(DIM*j_loop+l)*lda]=dr[k]*dr[l]*r2bcorr_offdiag_mix; -#else - resistance[DIM*i+k+(DIM*j_loop+l)*lda]=dr[k]*dr[l]*offdiag_fac; -#endif - } -#ifdef SD_RESISTANCE_CORRECT - myresistance[k]-=dr[k]*dr[k]*r2bcorr_offdiag_self; - resistance[DIM*i+k+(DIM*j_loop+k)*lda]+=r2bcorr_diag_mix; - myresistance[k]-=r2bcorr_diag_self; -#else - myresistance[k]-=dr[k]*dr[k]*offdiag_fac; - resistance[DIM*i+k+(DIM*j_loop+k)*lda]+=diag_fac; - myresistance[k]-=diag_fac; -#endif - } - } -#ifdef SD_RESISTANCE_CORRECT - myresistance[3]-=r2bcorr_offdiag_self*dr[0]*dr[1]; - myresistance[4]-=r2bcorr_offdiag_self*dr[0]*dr[2]; - myresistance[5]-=r2bcorr_offdiag_self*dr[1]*dr[2]; -#else - myresistance[3]-=offdiag_fac*dr[0]*dr[1]; - myresistance[4]-=offdiag_fac*dr[0]*dr[2]; - myresistance[5]-=offdiag_fac*dr[1]*dr[2]; -#endif - // python implementation: - //T=one*(1-9./32.*drn/a)+3./32.*dr*drt/drn/a; - } - - /*else{ // set the block to zero - // it might be faster to set everything in the beginning to zero ... - // or use sparse matrices ... - #pragma unroll 3 - for (int k=0; k < DIM; k++){ - #pragma unroll 3 - for (int l=0;l < DIM; l++){ - resistance[myindex(DIM*i+k,DIM*j+l)]=0; - } - } - }*/ - - if ( i < N){ - resistance[DIM*i+0+(DIM*i_index+0)*lda]=myresistance[0]; - resistance[DIM*i+0+(DIM*i_index+1)*lda]=myresistance[3]; - resistance[DIM*i+0+(DIM*i_index+2)*lda]=myresistance[4]; - resistance[DIM*i+1+(DIM*i_index+0)*lda]=myresistance[3]; - resistance[DIM*i+1+(DIM*i_index+1)*lda]=myresistance[1]; - resistance[DIM*i+1+(DIM*i_index+2)*lda]=myresistance[5]; - resistance[DIM*i+2+(DIM*i_index+0)*lda]=myresistance[4]; - resistance[DIM*i+2+(DIM*i_index+1)*lda]=myresistance[5]; - resistance[DIM*i+2+(DIM*i_index+2)*lda]=myresistance[2]; - } - /* - __syncthreads(); - int * sharedInteractions = (int *) cachedPos; // reuse shared memory - int * maxInteractions = sharedInteractions + blockDim.x*2; - sharedInteractions[threadIdx.x]=num_iterations-1; - sharedInteractions[threadIdx.x+blockDim.x]=0; - maxInteractions[threadIdx.x] =num_iterations-1; - maxInteractions[threadIdx.x+blockDim.x] =0; - for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){ - if (threadIdx.x < t){ - sharedInteractions[threadIdx.x]+=sharedInteractions[threadIdx.x+t]; - sharedInteractions[threadIdx.x+t]=0; - maxInteractions[threadIdx.x]=max(maxInteractions[threadIdx.x+t],maxInteractions[threadIdx.x]); - } - __syncthreads(); - } - if (threadIdx.x==0){ - sharedInteractions[0]+=sharedInteractions[1]; - atomicAdd(myInfo+1, sharedInteractions[0]); - maxInteractions[0]=max(maxInteractions[0],maxInteractions[1]); - atomicMax(myInfo+2, maxInteractions[0]); - }*/ -} - - -#ifndef SD_RESISTANCE_CORRECT -#warning "SD Brownian motion only support corrected resistance calculation ..." -#endif -/// Computing the brownian forces which arise from the nearfield contribution -/// of the mobility matrix -/// r : position of the particles -/// gaussian : pointer to a sufficent amount of gaussian distributed random -/// numbers -/// N : number of particles -/// L_g : size of periodic box -/// a : hydrodynamic particle radius -/// self_mobility : is 1./(6.*PI*eta*a) -/// brownian_force_nf : computed brownian force -__global__ void sd_compute_brownian_force_nearfield(const real * r, const real * gaussian,int N,const real * L_g, - real a, real self_mobility,real * brownian_force_nf){ - const int gaussian_ldd=((N+31)/32)*32; - int interactions=0; - real mypos[3]; - real writeCache[6]; - //real otherWriteCache[3]; - __shared__ real cachedPos[3*numThreadsPerBlock]; - __shared__ real choleskyCache[12*numThreadsPerBlock]; - //const int lda=(((N*3)+31)/32)*32; - //__shared__ real myresistance[6*numThreadsPerBlock]; - //real myresistance[6]; - //__shared__ real otherresistance[6*numThreadsPerBlock]; - int i = blockIdx.x*blockDim.x + threadIdx.x; -#ifndef SD_NOT_PERIODIC - __shared__ real L[3]; - if (threadIdx.x < 3){ // copy L to shared memory - L[threadIdx.x]=L_g[threadIdx.x]; - } -#endif - for (int l=0;l<3;l++){ - cachedPos[threadIdx.x+l*numThreadsPerBlock] = r[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3]; - writeCache[l]= 0; - } - __syncthreads(); - for (int d=0;d<3;d++){ - mypos[d] = cachedPos[threadIdx.x*3+d]; - } - - for (int offset=0;offset= N || i >= j || j >= N){ - writeCache[3]=0; - writeCache[4]=0; - writeCache[5]=0; - } - // j > i - else if (dr2 < 16 && 4 < dr2 ){// 2*a < drn < 4*a - wasInLoop = 1; - { - real drn= sqrt(dr2); // length of dr - real s = drn-2; - real ls = log(s); - - real const para_fac_c=-0.125+(9./40.)*log(2.)+3./112.*2.*log(2.); - real para_fac =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls); - real perp_fac =((1./6.)*ls); - - real dr4=dr2*dr2; - real dr6=dr4*dr2; - // constants for correction at cutoff - const real dr_c1 = 4; - const real dr_c2 = 4*4; - const real dr_c3 = 4*4*4; - const real dr_c4 = 4*4*4*4; - const real dr_c5 = 4*4*4*4*4; - const real dr_c6 = 4*4*4*4*4*4; - - const real r2bcorr_para_self_c = 1./(1.-9./4./dr_c2+3./dr_c4-1./dr_c6) + para_fac_c; - const real r2bcorr_para_mix_c = (6.*dr_c5-4.*dr_c3)/(4.*dr_c6-9.*dr_c4+12.*dr_c2-4.) + para_fac_c; - const real r2bcorr_perp_self_c = 1./(1.-25./16./dr_c2) + 1./6.*log(2.); - const real r2bcorr_perp_mix_c = 1./(16./20.*dr_c1-25./20./dr_c1) + 1./6.*log(2.); - // TODO: Make sure to use (real) and not double ... - // real computation - real r2bcorr_para_self =-( para_fac + ( 1./(1.-9./4./dr2+3./dr4-1./dr6) - r2bcorr_para_self_c )); - real r2bcorr_para_mix = ( para_fac + ( (6.*dr4*drn-4.*dr2*drn)/(4.*dr6-9.*dr4+12.*dr2-4.) - r2bcorr_para_mix_c )); - real r2bcorr_perp_self =-( perp_fac + ( 1./(1.-((real)25./16.)/dr2) - r2bcorr_perp_self_c )); - real r2bcorr_perp_mix = ( perp_fac + ( 1./(16./20.*drn-25./20./drn) - r2bcorr_perp_mix_c )); - //printf("%d %d show [ %e, %e, %e, %e ]\n",i,j,r2bcorr_para_self,r2bcorr_perp_self,r2bcorr_para_mix,r2bcorr_perp_mix); - - r2bcorr_diag_self = (r2bcorr_perp_self)/self_mobility; - r2bcorr_diag_mix = (r2bcorr_perp_mix )/self_mobility; - r2bcorr_offdiag_self = (r2bcorr_para_self - r2bcorr_perp_self) /self_mobility/dr2; - r2bcorr_offdiag_mix = (r2bcorr_para_mix - r2bcorr_perp_mix ) /self_mobility/dr2; - //printf("%d %d shoz [%e, %e, %e, %e]\n",i,j,r2bcorr_diag_self,r2bcorr_offdiag_self,r2bcorr_diag_mix,r2bcorr_offdiag_mix); - //r2bcorr_offdiag_self /= dr2; - //r2bcorr_offdiag_mix /= dr2; - } - // This is the cholesky decomposition. - // note that we try to avoid the usage of registers, so we use shared mem - // myCC is a makro, defined here to shorten the lines: -#define myCC(pos) choleskyCache[threadIdx.x+ (pos)*numThreadsPerBlock] - // without it would look more like this: - //choleskyCache[threadIdx.x+ 0*numThreadsPerBlock] = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[0]*dr[0]); - //choleskyCache[threadIdx.x+ 1*numThreadsPerBlock] = r2bcorr_offdiag_self*dr[0]*dr[1] / choleskyCache[threadIdx.x+ 0*numThreadsPerBlock]; - // L_{1,1} to L_{6,1} - myCC(0) = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[0]*dr[0]); - myCC(1) = r2bcorr_offdiag_self*dr[0]*dr[1] / myCC(0); - myCC(2) = r2bcorr_offdiag_self*dr[0]*dr[2] / myCC(0); - myCC(3) = (r2bcorr_diag_mix + r2bcorr_offdiag_mix *dr[0]*dr[0])/ myCC(0); - myCC(4) = r2bcorr_offdiag_mix *dr[0]*dr[1] / myCC(0); - myCC(5) = r2bcorr_offdiag_mix *dr[0]*dr[2] / myCC(0); - - writeCache[0]+=myCC(0) * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[1]+=myCC(1) * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[2]+=myCC(2) * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[3] =myCC(3) * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[4] =myCC(4) * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[5] =myCC(5) * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - // used: 6 - // L_{2,2} to L_{6,2} - myCC(0) = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[1]*dr[1] - SQR( myCC(1))); - myCC(6) = (r2bcorr_offdiag_self*dr[1]*dr[2] - myCC(2)*myCC(1))/myCC(0); - myCC(7) = (r2bcorr_offdiag_mix *dr[1]*dr[0] - myCC(3)*myCC(1))/myCC(0); - myCC(8) = (r2bcorr_diag_mix +r2bcorr_offdiag_mix *dr[1]*dr[1] - myCC(4)*myCC(1))/myCC(0); - myCC(9) = (r2bcorr_offdiag_mix *dr[1]*dr[2] - myCC(5)*myCC(1))/myCC(0); - writeCache[1]+=myCC(0) * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[2]+=myCC(6) * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[3]+=myCC(7) * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[4]+=myCC(8) * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[5]+=myCC(9) * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - // used: 11 - 1 - // L_{3,3} to L_{6,3} - myCC(0) = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[2]*dr[2] - SQR( myCC(2))- SQR( myCC(6))); - myCC(1) = (r2bcorr_offdiag_mix *dr[2]*dr[0] - myCC(3)*myCC(2) - myCC(7)*myCC(6))/myCC(0); - myCC(10) = (r2bcorr_offdiag_mix *dr[2]*dr[1] - myCC(4)*myCC(2) - myCC(8)*myCC(6))/myCC(0); - myCC(11) = (r2bcorr_diag_mix +r2bcorr_offdiag_mix *dr[2]*dr[2] - myCC(5)*myCC(2) - myCC(9)*myCC(6))/myCC(0); - writeCache[2]+=myCC(0) * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[3]+=myCC(1) * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[4]+=myCC(10) * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[5]+=myCC(11) * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - // used: 15 - 3 - // L_{4,4} to L_{6,4} - myCC(0) = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[0]*dr[0] - SQR( myCC(3))- SQR( myCC(7)) - - SQR( myCC(1))); - if (isnan(myCC(0))){ - printf("%d %d : [%e %e %e]\n",i,j,dr[0],dr[1],dr[2]); - } - myCC(2) = (r2bcorr_offdiag_self*dr[0]*dr[1] - myCC(4)*myCC(3) - myCC(8)*myCC(7) - - myCC(10)*myCC(1))/myCC(0); - myCC(6) = (r2bcorr_offdiag_self*dr[0]*dr[2] - myCC(5)*myCC(3) - myCC(9)*myCC(7) - - myCC(11)*myCC(1))/myCC(0); - writeCache[3]+=myCC(0) * gaussian[3*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[4]+=myCC(2) * gaussian[3*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[5]+=myCC(6) * gaussian[3*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - // used: 18 - 6 - // L_{5,5} and L_{6,5} - myCC(0) = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[1]*dr[1] - SQR( myCC(4))- SQR( myCC(8)) - - SQR( myCC(10))- SQR( myCC(2))); - if (isnan(myCC(0))){ - printf("%d %d : [%e %e %e] \n",i,j,dr[0],dr[1],dr[2]); - } - myCC(3) = (r2bcorr_offdiag_self*dr[1]*dr[2] - myCC(5)*myCC(4) - myCC(9)*myCC(8) - - myCC(11)*myCC(10) - myCC(6)*myCC(2))/myCC(0); - writeCache[4]+=myCC(0) * gaussian[4*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - writeCache[5]+=myCC(3) * gaussian[4*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - // used: 20 - 10 - // L_{6,6} would be: - myCC(0) = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[2]*dr[2] - SQR(myCC(5)) - SQR(myCC(9)) - - SQR(myCC(11)) - SQR(myCC(6)) - SQR(myCC(3))); - if (isnan(myCC(0))){ - printf("%d %d : [%e %e %e] \n",i,j,dr[0],dr[1],dr[2]); - } - writeCache[5]+=myCC(0) * gaussian[5*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions]; - // used 21 - 15 - interactions++; - for (int l=0;l<6;l++){ - if (isnan(writeCache[l])){ - printf("%d %d, %d: dr=[%e, %e, %e]\n",i,j,l,dr[0],dr[1],dr[2]); - } - } - } - // for the particle j (writeCache[3-5]) we can reduce localy: - - int * haveInteraction = (int *) choleskyCache+6*numThreadsPerBlock; // reuse shared memory - choleskyCache[threadIdx.x+0*numThreadsPerBlock]=writeCache[3]; - choleskyCache[threadIdx.x+1*numThreadsPerBlock]=0; - choleskyCache[threadIdx.x+2*numThreadsPerBlock]=writeCache[4]; - choleskyCache[threadIdx.x+3*numThreadsPerBlock]=0; - choleskyCache[threadIdx.x+4*numThreadsPerBlock]=writeCache[5]; - choleskyCache[threadIdx.x+5*numThreadsPerBlock]=0; - haveInteraction[threadIdx.x]=wasInLoop; - haveInteraction[threadIdx.x+numThreadsPerBlock]=0; - for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){ - if (threadIdx.x < t){ - choleskyCache[threadIdx.x]+=choleskyCache[threadIdx.x+t]; - choleskyCache[threadIdx.x+2*numThreadsPerBlock]+=choleskyCache[threadIdx.x+t +2*numThreadsPerBlock]; - choleskyCache[threadIdx.x+4*numThreadsPerBlock]+=choleskyCache[threadIdx.x+t +2*numThreadsPerBlock]; - haveInteraction[threadIdx.x]|=haveInteraction[threadIdx.x+t]; - choleskyCache[threadIdx.x+t]=0; - choleskyCache[threadIdx.x+t +2*numThreadsPerBlock]=0; - choleskyCache[threadIdx.x+t +4*numThreadsPerBlock]=0; - haveInteraction[threadIdx.x+t]=0; - } - __syncthreads(); - } - if (threadIdx.x==0){ - if (haveInteraction[0] || haveInteraction[1]){ - choleskyCache[0]+=choleskyCache[1]; - choleskyCache[2*numThreadsPerBlock]+=choleskyCache[1+2*numThreadsPerBlock]; - choleskyCache[4*numThreadsPerBlock]+=choleskyCache[1+4*numThreadsPerBlock]; - real tmp=atomicAdd(brownian_force_nf+j*3, choleskyCache[0]); - bool error=isnan(tmp); - tmp = atomicAdd(brownian_force_nf+j*3+1, choleskyCache[2*numThreadsPerBlock]); - error|=isnan(tmp); - tmp = atomicAdd(brownian_force_nf+j*3+2, choleskyCache[4*numThreadsPerBlock]); - error|=isnan(tmp); - if (error){ - printf("%d %d: dr=[%e, %e, %e]\n",i,j,dr[0],dr[1],dr[2]); - } - } - } - } - } - if ( i < N){ -#pragma unroll 3 - for (int k=0;k<3;k++){ - real tmp = atomicAdd(brownian_force_nf+i*3+k, writeCache[k]); - //assert(!isnan(tmp)); - } - } -} - -/// computing the matrix vector y = c * A * x product with a square sparse -/// matrix A and a scalar factor c -/// size : size of the matrix and of the input and output vectors -/// factor : factor $c$ is the given prefactor -/// matrix : data of the sparse matrix -/// ldd : leading dimension of the matrix -/// ldd_short : leading dimension of col_idx -/// col_idx : indices of the entries in the matrix -/// row_l : number of entris per row -/// vec_in : the input vecor $x$ -/// vec_out : the output vector $y$ -__global__ void sd_multiply_sparse_matrix_vector(const int size, const real factor, const real * matrix, const int ldd, const int ldd_short, - const int * col_idx, const int * row_l, const real * vec_in, real * vec_out ){ - int id = threadIdx.x + blockIdx.x*blockDim.x; - if (id DISP_MAX*DISP_MAX){ - disp2=DISP_MAX/sqrt(disp2); -#pragma unroll - for (int d=0;d<3;d++){ - disp_d[i+d]*=disp2; - } - } -} - -/// if SD_AVOID_COLLISION is set avoid that particles are overlapping after -/// the displacement, else only perform the integration step -/// r_d : position of the particles -/// disp_d : displacement of the particles -/// L : box dimensions -/// a : particle radius -/// N : number of particles -__global__ void sd_real_integrate( real * r_d , const real * disp_d, const real * L, real a, int N) -{ - int idx = blockIdx.x*blockDim.x + threadIdx.x; - // t is the factor how far of disp_d we will move. - // in case everything is fine, we will move t, if there is some trouble, - // we will move less to avoid collision - real t=1; -#ifdef SD_AVOID_COLLISON - real rnew[DIM]; - for (int d=0;dN){ - continue; - } - } - real dr2=0; - for (int d=0;d= 2.0 - x=__real2int_rd(tmp); - //x=floor(tmp); - x%=bucketNum[d]; - // avoid negativ numbers - x= (x < 0)?x+bucketNum[d]: x; - //x+=bucketNum[d]; - //x%=bucketNum[d]; - switch (d){ - case 0: - bucket.x = x; - break; - case 1: - bucket.y = x; - break; - case 2: - bucket.z = x; - break; - } - } - int myBucket = bucket.x + bucket.y*bucketNum[0] + bucket.z*bucketNum[0]*bucketNum[1]; - int num = atomicAdd(particleCount+myBucket, 1); - if (num < maxParticlePerCell){ - particleList[myBucket+num*totalBucketNum]=i; - particleToBucketList[i]=myBucket; - }else{ - // Note: printf in device code works only with c.c.>=2.0 -#if (__CUDA_ARCH__>=200) - printf("error: overflow in grid cell (%i,%i,%i)\n",bucket.x,bucket.y,bucket.z); -#endif - } - } -} - -/// This function reads the diagonal element entries of a square matrix -/// and stores them in the vector diag and the inverse of them in diag_i -/// size : size of the matrix -/// mat_a : the matrix -/// lda : leading dimension of the matrix mat_a -/// diag : the diagonal (OUT) -/// diag_i : the inverse of the diagonal (OUT) -__global__ void sd_get_diag(int size, const real * mat_a,int lda,real * diag,real * diag_i){ - const int stepw=lda+1; - for (int l=threadIdx.x+blockDim.x*blockIdx.x;l. -*/ - -#ifndef __INTEGRATE_SD_CUDA_KERNEL_HPP -#define __INTEGRATE_SD_CUDA_KERNEL_HPP - -#include "integrate_sd.hpp" - -#ifdef SD - -// This computes the farfield contribution. -// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...] -// N is the number of particles -// self_mobility is 1./(6.*PI*viscosity*radius) -// a is the particle radius -// mobility is the mobility matrix which will be retruned -// L is the boxlength -__global__ void sd_compute_mobility_matrix(const real * r, int N, real self_mobility, real a, real * mobility); - - - -/// This computes the farfield contribution of the mobility with ewald summation -/// this version assumes that the real-space cutoff is smaller than the boxlength -// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...] -// N is the number of particles -// self_mobility is 1./(6.*PI*eta*a) -// a is the particle radius -// mobility is the mobility matrix which will be retruned -// L_d is the boxlength -// cutoff the realspace cutoff distance -// xi the splitting parameter as defined by Beenakker 1986 -// xa = xi * a -// xa3 = xa * xa * xa -__global__ void sd_compute_mobility_matrix_real_short(const real * r, int N, real self_mobility, real a, const real * L_g, real * mobility, - real cutoff, real xi, real xa, real xa3); - -/// This computes the farfield contribution of the mobility with ewald summation -/// this kernel computes the sines and cosinus. -// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...] -// N is the number of particles -// vecs are the k vectors -// num is the number of k vectors -// cosines is the pointer where the cosines : cos( position \times k-vector) are saved -// sines the same for sines -// ldd the rounded number of particles -__global__ void sd_compute_mobility_sines(const real * r, int N, const real * vecs, int num, real * sines, real * cosines, int ldd); - -/// adds to each of the diagonal elemnts of the size*size matrix \param matrix -/// with lda \param lda 1 -__global__ void sd_add_identity_matrix(real * matrix, int size, int lda); - -void _cudaCheckError(const char *msg, const char * file, const int line); -// this computes the near field -// it calculates the ResistanceMatrix -__global__ void sd_compute_resistance_matrix(const real * r, int N, real self_mobility, real a, const real * L, real * resistance, int * myInfo); -// this computes the near field -// it calculates the Sparse ResistanceMatrix - -__global__ void sd_compute_resistance_matrix_sparse(const real * pos, const int N, const real self_mobility,const real a,const real * _L, - real * resistance,const int * col_idx,const int * row_l, int * myInfo); -// TODO: make the order of arguments uniform (and logical?) -// TODO: description here -__global__ void sd_compute_brownian_force_nearfield(const real * r,const real * gaussian_nf,int N,const real * L, real a, real self_mobility,real * brownian_force_nf); - - -/// This computes the farfield contribution of the mobility with ewald summation -/// this kernel computes the sines and cosinus. -// forces is the vector of [fx_1, fy_1, fz_1, fx_2, fy_2, fz_2, ...] -// a is the particle radius -// mobility is the mobility matrix which will be retruned (in/out) -// L_d is the boxlength -// cutoff the wavespace cutoff distance -__global__ void sd_wavepart_sum(const real * const forces, const real * const vecs, int num, const real * const matrices_d, - const real * const sines, const real * const cosines, int ldd, int N, real * sin_sum, real * cos_sum, real max); - - -__global__ void sd_wavepart_assemble(const int num, const real * const sines, const real * const cosines, const real * const sin_sum, - const real * const cos_sum, const int ldd, real * out, real max, int N, const real factor); - - -/// Add the wavepart contribution of the mobility to the matrix -__global__ void sd_wavepart_addto_matrix(const int num, const real * const matrices_d, const real * const sines, const real * const cosines, - const int ldd, const int N, real * const mobility_d, const int mat_ldd); - -// make sure to have one thread per particle -__global__ void sd_real_integrate_prepare( const real * r_d , real * disp_d, const real * L, real a, int N); -__global__ void sd_real_integrate( real * r_d , const real * disp_d, const real * L, real a, int N); - - -// this sets a block to zero -// matrix: pointer to the given matrix -// size : the size of the matrix (in the example below 3N) -// ldd : the leading dimension of the matrix -__global__ void sd_set_zero_matrix(real * matrix, int size, int ldd); - - -// this sets a block to zero -// data : pointer to the given data -// size : the size of the data -__global__ void sd_set_zero(real * data, int size); - -// this sets a block to an given integer -// data : pointer to the given data -// size : the size of the data -// value : the value written to the data block -__global__ void sd_set_value(int * data, int size, int value); - -// this sets a block to an given real -// data : pointer to the given data -// size : the size of the data -// value : the value written to the data block -__global__ void sd_set_value(real * data, int size, real value); - - -// This function reads the diagonal element entries of a matrix -// and stores them in the vector diag -// and the inverse of them in diag_i -__global__ void sd_get_diag(int size, const real * mat_a,int lda,real * diag,real * diag_i); - - - -// implementation of a bucket sort algorithm -// puts all the N particles with given position pos -// and particle radius a within the periodic boundary -// conditions of boxSize L_i = bucketSize_i * bucketNum_i -// puts them in the list particleList -// pos device array of particle position xyz -// bucketSize device array with the number of buckets in x y and z direction -// bucketNum device array with the size of a bucket in x y and z direction -// N number of particles -// particleCount device array of the numbers of particles per bucket. must be initalized to zero -// particleList device array of the partilces in each bucket -// maxParticlePerCell maximum particles per cell -// totalBucketNUm bucketNum[0]*bucketNum[1]*bucketNum[2] - the total number of buckets -__global__ void sd_bucket_sort( const real * pos , const real * bucketSize, const int * bucketNum, int N, int * particleCount, - int * particleList, int maxParticlePerCell, int totalBucketNum, int * particleToBucketList); - - -// matrix multiplication for sparse matrizes -// never use direct, use always wrapper -__global__ void sd_multiply_sparse_matrix_vector(const int size, const real factor, const real * matrix, const int ldd, const int ldd_short, - const int * col_idx, const int * row_l, const real * vec_in, real * vec_out ); - -// finding all interacting particles, given a certain iteraction range. -// this requires bucket sort and is therefor O(N) -// if interaction_range is > min(L)/2 particles can appeare more often in the list. -// self_interaction is excluded. -__global__ void sd_find_interacting_particles(const real * pos,const real * _L, const int N, int * interacting, int * num_interacting, - const int * particle_count, const int * particle_sorted_list, const real * bucket_size_, - const int * bucket_num_, const int * particle_to_bucket_list, const real interaction_range, - const int total_bucket_num); - -// finding all interacting particles, given a certain iteraction range. -// this works without bucket sort and is O(N^2) -__global__ void sd_find_interacting_particles(const real * pos,const real * L_g, const int N, int * interacting, int * num_interacting, - const real interaction_range); - - - -/// computing the inf norm of a given vector (or in other words the largest value) -/// if more than one block is used, this kernel does not perform the final reduction -/// size : size of the vector -/// vec : data of the vector -/// res : the 1 norm (OUT) -__global__ void sd_nrm_inf(const int size, const int * const vec, int * res); - - - -__global__ void sd_nrm1(const int size,const real * const vec, real * erg); - -#endif /* SD */ - -#endif diff --git a/src/core/integrate_sd_matrix.cu b/src/core/integrate_sd_matrix.cu deleted file mode 100644 index 5d670d58358..00000000000 --- a/src/core/integrate_sd_matrix.cu +++ /dev/null @@ -1,283 +0,0 @@ -/* - Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project - Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 - Max-Planck-Institute for Polymer Research, Theory Group - - This file is part of ESPResSo. - - ESPResSo is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - ESPResSo is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "config.hpp" - -#ifdef SD - -#include -#include -#include "cuda_runtime.h" - -#include "integrate_sd.hpp" -#include "integrate_sd_cuda.hpp" -#include "integrate_sd_matrix.hpp" - -#include - -#if defined(OMPI_MPI_H) || defined(_MPI_H) -#error CU-file includes mpi.h! This should not happen! -#endif - -/// constructor for wave part -/// _ldd_short : ldd_short of the matrix -wavepart::wavepart(int _ldd_short):ldd_short(_ldd_short){ - vecs = NULL; - matrices = NULL; - cosines = NULL; - sines = NULL; - num = 0; - max = 0; -#ifdef SD_DEBUG - vecs_hash = NULL; - matrices_hash = NULL; - cosines_hash = NULL; - sines_hash = NULL; -#endif -} - -/// destructor of the wave part -wavepart::~wavepart(){ - if (vecs){ - cuda_safe_mem(cudaFree((void*)vecs)); - vecs=NULL; - } - if (matrices){ - cuda_safe_mem(cudaFree((void*)matrices)); - matrices=NULL; - } - if (cosines){ - cuda_safe_mem(cudaFree((void*)cosines)); - cosines=NULL; - } - if (sines){ - cuda_safe_mem(cudaFree((void*)sines)); - sines=NULL; - } -#ifdef SD_DEBUG - if (vecs_hash){ - free(vecs_hash); - vecs_hash=NULL; - } - if (matrices_hash){ - free(matrices_hash); - matrices_hash=NULL; - } - if (sines_hash){ - free(sines_hash); - sines_hash=NULL; - } - if (cosines_hash){ - free(cosines_hash); - cosines_hash=NULL; - } -#endif - max=0; -} - - -/// free the wave part of the matrix -void matrix::_free_wavespace(){ - if (wavespace){ - //wavespace->~wavepart(); - //free((void*)wavespace); - delete wavespace; - wavespace=NULL; - } -} - -/// constructor of the matrix -matrix::matrix(){ - data=NULL; - col_idx=NULL; - row_l=NULL; - wavespace=NULL; - is_sparse=false; - size=0; - ldd=0; - ldd_short=0; -#ifdef SD_DEBUG - dense=NULL; - data_hash=NULL; - dense_hash=NULL; -#endif -} - -/// destructor of the matrix -matrix::~matrix(){ - if (data){ - cuda_safe_mem(cudaFree((void*)data)); - data=NULL; - } - if (col_idx){ - cuda_safe_mem(cudaFree((void*)col_idx)); - col_idx=NULL; - } - if (row_l){ - cuda_safe_mem(cudaFree((void*)row_l)); - row_l=NULL; - } -#ifdef SD_DEBUG - if (dense){ - cuda_safe_mem(cudaFree((void*)dense)); - dense=NULL; - } - if (data_hash){ - free(data_hash); - data_hash=NULL; - } - if (dense_hash){ - free(dense_hash); - dense_hash=NULL; - } -#endif - _free_wavespace(); -} - - -/// print some basic properties of the matrix, if SD_DEBUG is enabled -void matrix::printStats() const{ -#ifdef SD_DEBUG - printf("addr:%p\tsize:%d\tldd:%d\tsparse:%d",this,size,ldd,is_sparse); -#endif -} - -void getAndHash(const real * data, const int size, unsigned char result[MD5_DIGEST_LENGTH]){ -#ifdef SD_DEBUG - real host[size]; - cuda_safe_mem(cudaMemcpy( host, data, size*sizeof(real), cudaMemcpyDeviceToHost )); - MD5((unsigned char*) host, size, result); -#endif -} - -void matrix::hash_data(){ -#ifdef SD_DEBUG - if (data_hash == NULL){ - data_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH); - } - getAndHash(data, size*ldd, data_hash); -#endif -} - -void matrix::hash_dense(){ -#ifdef SD_DEBUG - if (dense_hash == NULL){ - dense_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH); - } - getAndHash(dense, size*ldd, dense_hash); -#endif -} - -void matrix::assert_all() const{ -#ifdef SD_DEBUG - unsigned char tmp[MD5_DIGEST_LENGTH]; - if (data_hash){ - getAndHash(data, size*ldd, tmp); - for (int i=0; i < MD5_DIGEST_LENGTH; i++){ - assert (tmp[i]==data_hash[i]); - printf("%02x",tmp[i]); - } - } - if (dense_hash){ - getAndHash(dense, size*ldd, tmp); - for (int i=0; i < MD5_DIGEST_LENGTH; i++){ - assert (tmp[i]==dense_hash[i]); - printf("%02x",tmp[i]); - } - } - if (wavespace){ - wavespace->assert_all(); - } -#endif -} - - -void wavepart::hash_vecs(){ -#ifdef SD_DEBUG - fprintf(stderr,"going to allocate %d bytes, vecs_hash is %p\n",MD5_DIGEST_LENGTH, vecs_hash); - if (vecs_hash == NULL){ - vecs_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH); - fprintf(stderr,"allocated %d bytes\n",MD5_DIGEST_LENGTH); - } - getAndHash(vecs, num*3 , vecs_hash); -#endif -} -void wavepart::hash_matrices(){ -#ifdef SD_DEBUG - if (matrices_hash == NULL){ - matrices_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH); - } - getAndHash(matrices, num*6, matrices_hash); -#endif -} -void wavepart::hash_cosines(){ -#ifdef SD_DEBUG - if (cosines_hash == NULL){ - cosines_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH); - } - getAndHash(cosines, num*ldd_short, cosines_hash); -#endif -} -void wavepart::hash_sines(){ -#ifdef SD_DEBUG - if (sines_hash == NULL){ - sines_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH); - } - getAndHash(sines, num*ldd_short, sines_hash); -#endif -} - -void wavepart::assert_all() const{ -#ifdef SD_DEBUG - unsigned char tmp[MD5_DIGEST_LENGTH]; - if (vecs_hash){ - getAndHash(vecs, 3*num, tmp); - for (int i=0; i < MD5_DIGEST_LENGTH; i++){ - assert (tmp[i]==vecs_hash[i]); - printf("%02x",tmp[i]); - } - } - if (matrices_hash){ - getAndHash(matrices, 6*num, tmp); - for (int i=0; i < MD5_DIGEST_LENGTH; i++){ - assert (tmp[i]==matrices_hash[i]); - printf("%02x",tmp[i]); - } - } - if (sines_hash){ - getAndHash(sines, num*ldd_short, tmp); - for (int i=0; i < MD5_DIGEST_LENGTH; i++){ - assert (tmp[i]==sines_hash[i]); - printf("%02x",tmp[i]); - } - } - if (cosines_hash){ - getAndHash(cosines, num*ldd_short, tmp); - for (int i=0; i < MD5_DIGEST_LENGTH; i++){ - assert (tmp[i]==cosines_hash[i]); - printf("%02x",tmp[i]); - } - } -#endif -} - -#endif /* SD */ - diff --git a/src/core/integrate_sd_matrix.hpp b/src/core/integrate_sd_matrix.hpp deleted file mode 100644 index 30a6fe5f954..00000000000 --- a/src/core/integrate_sd_matrix.hpp +++ /dev/null @@ -1,72 +0,0 @@ -#pragma once - -/// stuct for sparse matrix: (blocked version off ELLPACK-R, see \link http://elib.uni-stuttgart.de/opus/volltexte/2010/5033/pdf/DIP_2938.pdf \endlink -/// for dense matrizes use only data - -struct wavepart{ -public: - real * vecs; - real * matrices; - real * cosines; - real * sines; - int num; - int max; - int & ldd_short; - wavepart(int _ldd_short); - ~wavepart(); -private: - wavepart(wavepart & other); -public: - void print( int N, int ldd) const; - void hash_vecs(); - void hash_matrices(); - void hash_cosines(); - void hash_sines(); - void assert_all() const; -#ifdef SD_DEBUG -private: - unsigned char * vecs_hash; - unsigned char * matrices_hash; - unsigned char * cosines_hash; - unsigned char * sines_hash; -#endif -}; - - - -struct matrix{ - //public: - real * data; - int * col_idx; - int * row_l; - wavepart * wavespace; - bool is_sparse; - int size; - int ldd; - int ldd_short; - void _init(); - matrix(); - ~matrix(); - void _free_wavespace(); - void _free(); - void printWavespace() const; - void print() const; - void printStats() const; -private: - matrix(matrix & other); // disable copy operator -#ifdef SD_DEBUG - unsigned char * data_hash; - unsigned char * dense_hash; -#endif -public: - real * dense; - void hash_data(); - void hash_dense(); - void assert_all() const; -}; - - -#ifdef SD_DEBUG - void getAndHash(const real * data, const int size, unsigned char * result); -#endif - diff --git a/src/core/thermostat.hpp b/src/core/thermostat.hpp index f36735bf0a5..e3b93fdcb69 100755 --- a/src/core/thermostat.hpp +++ b/src/core/thermostat.hpp @@ -48,8 +48,6 @@ #define THERMO_INTER_DPD 16 #define THERMO_GHMC 32 #define THERMO_CPU 64 -#define THERMO_SD 128 -#define THERMO_BD 256 /*@}*/ // Handle switching of noise function flat vs Gaussian