diff --git a/INSTALL.swift b/INSTALL.swift
index 6aba15db9d..239149fc31 100644
--- a/INSTALL.swift
+++ b/INSTALL.swift
@@ -150,6 +150,15 @@ before you can build it.
   least "--with-metis". ParMETIS is preferred over METIS when there
   is a choice.
+- Scotch:
+  a build of the Scotch library should be used to
+  optimize the load between MPI nodes by mapping the decomposed domain
+  to the available compute. This should be found in the
+  standard installation directories, or pointed at using the
+  "--with-scotch" configuration option.
+  In this case the top-level installation directory of the build
+  should be given.
+
 - libNUMA:
   a build of the NUMA library can be used to pin the threads to
   the physical core of the machine SWIFT is running on. This is
diff --git a/Makefile.am b/Makefile.am
index 3ca9fd5e74..3420209f2f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -48,8 +48,8 @@ EXTRA_LIBS = $(GSL_LIBS) $(HDF5_LIBS) $(FFTW_LIBS) $(NUMA_LIBS) $(PROFILER_LIBS) $(CHEALPIX_LIBS)
 # MPI libraries.
-MPI_LIBS = $(PARMETIS_LIBS) $(METIS_LIBS) $(MPI_THREAD_LIBS) $(FFTW_MPI_LIBS)
-MPI_FLAGS = -DWITH_MPI $(PARMETIS_INCS) $(METIS_INCS) $(FFTW_MPI_INCS)
+MPI_LIBS = $(SCOTCH_LIBS) $(PARMETIS_LIBS) $(METIS_LIBS) $(MPI_THREAD_LIBS) $(FFTW_MPI_LIBS)
+MPI_FLAGS = -DWITH_MPI $(SCOTCH_INCS) $(PARMETIS_INCS) $(METIS_INCS) $(FFTW_MPI_INCS)
 # Programs.
 bin_PROGRAMS = swift
diff --git a/README-Scotch.md b/README-Scotch.md
new file mode 100644
index 0000000000..dcf7550801
--- /dev/null
+++ b/README-Scotch.md
@@ -0,0 +1,193 @@
Information on how to run SWIFT with Scotch mapping and the test environment used on Cosma 8. The code has been tested with Scotch version 7.0.2.

Last update 18th August 2023.


Obtaining Scotch (as per the gitlab [repo](https://gitlab.inria.fr/scotch/scotch))
----------------

**Scotch** is publicly available under the CeCILL-C free software license, as described [here](https://gitlab.inria.fr/scotch/scotch/blob/master/LICENSE_en.txt). The license itself is available [here](https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/CeCILL-C_V1-en.txt).

To use the latest version of **Scotch**, please clone the master branch:

    git clone git@gitlab.inria.fr:scotch/scotch.git

Tarballs of the **Scotch** releases are available [here](https://gitlab.inria.fr/scotch/scotch/-/releases).

Instructions for installing locally on Cosma 8
----------------
_Environment_
```
 module load cosma/2018 python/3.6.5 intel_comp/2022.1.2 compiler openmpi/4.1.4 fftw/3.3.9 parallel_hdf5/1.12.0 parmetis/4.0.3-64bit metis/5.1.0-64bit gsl/2.5
 module load cmake
 module load bison
```
Navigate to the Scotch directory and carry out the following commands:

```
 mkdir build
 cd build
 cmake -DCMAKE_INSTALL_PREFIX=/path-to-install-dir ..
 make -j5
 make install
```

Configure SWIFT with Scotch
----------------

Follow the usual installation [instructions](https://gitlab.cosma.dur.ac.uk/swift/swiftsim/-/blob/master/INSTALL.swift). If Scotch is installed locally, the `--with-scotch=/full/scotch/install/dir/path/` flag will additionally need to be passed to `./configure`.

There are also two beta-testing Scotch modules available at the time of writing (as indicated by the dot before the version number, `.7.0.4`). You can set up the modules environment as
```
 module load cosma/2018 python/3.6.5 intel_comp/2022.1.2 compiler openmpi/4.1.4 fftw/3.3.9 parallel_hdf5/1.12.0 parmetis/4.0.3-64bit metis/5.1.0-64bit gsl/2.5
```
Then,
```
 module load scotch/.7.0.4-32bit
```
or
```
 module load scotch/.7.0.4-64bit
```
depending on whether you want the 32-bit or the 64-bit version of Scotch.

**Warning on compiler inclusion preferences**

You can use the `modules` environment to load a suitable Scotch module. On Cosma8 you can use `module load scotch/.7.0.4-32bit` or `module load scotch/.7.0.4-64bit` to get the 32-bit or the 64-bit compiled module for Scotch. At the time of writing, both the 32-bit and the 64-bit versions of Scotch can be loaded together, i.e. there is no exclusion check in the module files. There may be cases where both are needed, for example for testing or development purposes. Care must then be taken to avoid unintentional compile-time behaviour: the order of the `-I/.../...` command-line switches that find their way into the `Makefile`(s) follows the order in which the modules were loaded, and the compiler searches those include paths in that order -- so if the 32-bit module is loaded first, its `-I` switch is placed first, a 64-bit module loaded afterwards has its switch placed after it, and so on.

Also, in the case of a locally-built Scotch package, it is advised that you _do not_ load any Scotch module in the environment, and instead pass `--with-scotch=/full/scotch/install/dir/path/` to `./configure`, to make sure that the `Makefile` picks up the correct include and library files.

Running with Scotch
----------------

Scotch carries out a _mapping_ of a _source_ (or process) graph onto a _target_ (or architecture) graph. The weighted _source_ graph is generated by SWIFT and captures the computation and communication cost across the computational domain. The _target_ graph defines the communication cost across the available computing architecture. Therefore, to make use of the Scotch _mapping_ algorithms, a target architecture file (_target.tgt_) must be generated that mirrors the set-up of the cluster being used. Scotch provides optimised architecture files which capture most HPC set-ups. For Cosma8 runs we target NUMA regions, so we have modelled the architecture as a `tleaf` structure.

In the following examples it is assumed that one MPI rank is mapped to each Cosma 8 NUMA region. This requires that `cpus-per-task=16` is set in the SLURM submission script. The Cosma 8 nodes consist of 8 NUMA regions per node, with 4 NUMA regions per socket. Example `tleaf` files for various set-ups are given below, where the intrasocket communication cost between NUMA regions is set at _5_, the intranode (but across sockets) cost at _10_ and the internode cost at _1000_. These weightings are estimated values but have been shown to give satisfactory results in the testcases explored.
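For orientation, the standalone sketch below shows what this set-up corresponds to in terms of raw Scotch calls: it builds the 2-node / 16-rank `tleaf` hierarchy from the table below in memory (using `SCOTCH_archTleaf`, which is part of the Scotch API but is not used by SWIFT itself), creates a toy 16-vertex ring as the source graph, and maps one onto the other with the `SCOTCH_STRATBALANCE` strategy and the `0.05` imbalance ratio used throughout this document. It is only an illustration of the mapping API, not SWIFT's code path.

```
/* Illustrative sketch only, not SWIFT code: map a toy ring graph onto the
 * 2-node / 16-NUMA-region Cosma8 tleaf target described in this README.
 * Compile with something like:  cc sketch.c -lscotch -lscotcherr          */
#include <stdio.h>
#include <stdint.h>
#include <scotch.h>

int main(void) {
  /* Target architecture: equivalent of "tleaf 3 2 1000 2 10 4 5",
   * i.e. 2 nodes x 2 sockets x 4 NUMA regions, with relative costs
   * 1000 (internode), 10 (intersocket), 5 (intrasocket). */
  SCOTCH_Arch arch;
  SCOTCH_archInit(&arch);
  SCOTCH_Num sizetab[3] = {2, 2, 4};
  SCOTCH_Num linktab[3] = {1000, 10, 5};
  if (SCOTCH_archTleaf(&arch, 3, sizetab, linktab) != 0) return 1;

  /* Source graph: a 16-vertex ring with unit weights (SWIFT instead builds
   * the 26-neighbour cell graph with task-based vertex and edge weights). */
  const SCOTCH_Num n = 16;
  SCOTCH_Num verttab[17], edgetab[32];
  for (SCOTCH_Num v = 0; v < n; v++) {
    verttab[v] = 2 * v;
    edgetab[2 * v] = (v + n - 1) % n;  /* previous vertex on the ring */
    edgetab[2 * v + 1] = (v + 1) % n;  /* next vertex on the ring     */
  }
  verttab[n] = 2 * n;
  SCOTCH_Graph graph;
  SCOTCH_graphInit(&graph);
  if (SCOTCH_graphBuild(&graph, 0, n, verttab, NULL, NULL, NULL,
                        2 * n, edgetab, NULL) != 0) return 1;

  /* Strategy: balance-driven mapping with a 5% load-imbalance tolerance. */
  SCOTCH_Strat strat;
  SCOTCH_stratInit(&strat);
  if (SCOTCH_stratGraphMapBuild(&strat, SCOTCH_STRATBALANCE,
                                SCOTCH_archSize(&arch), 0.05) != 0) return 1;

  /* Map the source graph onto the target: parttab[v] is the NUMA region
   * (i.e. the MPI rank) that vertex v is assigned to. */
  SCOTCH_Num parttab[16];
  if (SCOTCH_graphMap(&graph, &arch, &strat, parttab) != 0) return 1;
  for (SCOTCH_Num v = 0; v < n; v++)
    printf("vertex %d -> rank %d\n", (int)v, (int)parttab[v]);

  SCOTCH_stratExit(&strat);
  SCOTCH_graphExit(&graph);
  SCOTCH_archExit(&arch);
  return 0;
}
```

In an actual SWIFT run only the `target.tgt` file is needed: SWIFT loads the target architecture with `SCOTCH_archLoad` and performs the equivalent graph construction and mapping calls inside `pick_scotch` in `src/partition.c`.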
An estimate of the relative latency between NUMA regions on a node can be obtained with [hwloc](https://github.com/open-mpi/hwloc), specifically with `hwloc-distances`.

**Example tleaf graphs to represent various Cosma8 configurations**
| Number of nodes | Number of MPI ranks | tleaf |
| --------------- | ------------------- | ----------------------- |
| 1 | 2 | tleaf 1 2 5 |
| 1 | 8 | tleaf 2 2 10 4 5 |
| 2 | 16 | tleaf 3 2 1000 2 10 4 5 |
| 4 | 32 | tleaf 3 4 1000 2 10 4 5 |
| 8 | 64 | tleaf 3 8 1000 2 10 4 5 |

The first number gives the number of layers of the tleaf structure and each subsequent pair gives the number of vertices in a layer and the cost to communicate between them. For example, the numbers in the 64 MPI rank case (`tleaf 3 8 1000 2 10 4 5`), reading left to right, are as follows: `3` is the number of layers in the tleaf structure; `8` is the number of vertices in the first layer (8 compute nodes); `1000` is the relative cost of internode communication; `2` is the number of vertices in the second layer (the number of sockets on each node); `10` is the relative cost of intersocket communication; `4` is the number of vertices in the final layer (the number of NUMA regions per socket on Cosma8); and `5` is the relative cost of intrasocket communication.

The user needs to define this tleaf structure and save it as `target.tgt` in the directory they will run SWIFT from. Ongoing work focuses on automatically generating this target architecture at run time.

With OpenMPI the `mpirun` option `--map-by numa` has been found to be optimal. This is in contrast to the previously suggested `--bind-to none` on the Cosma8 [site](https://www.dur.ac.uk/icc/cosma/support/cosma8/).

Scotch details
----------------

Scotch carries out the mapping using various strategies, which are outlined in the [documentation](https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/scotch_user7.0.pdf). The strategies trialled include `SCOTCH_STRATDEFAULT`, `SCOTCH_STRATBALANCE`, `SCOTCH_STRATQUALITY` and `SCOTCH_STRATSPEED`. The Scotch strategy is passed to the mapping functions. For the runs carried out here it was found that the global flag `SCOTCH_STRATBALANCE` with an imbalance ratio of `0.05` worked best. These values are passed to [`SCOTCH_stratGraphMapBuild`](https://github.com/UCL/swiftsim/blob/cb06b0e5c3d8457c474d0084d973f437d29b20d8/src/partition.c#L1657).

One issue with Scotch is that, when the number of MPI ranks is comparable to the dimensionality of the modelled SWIFT system, the optimal mapping strategy does not necessarily map to all available NUMA regions. At present this is not handled explicitly in the code and the partition reverts to a vectorised or previous partitioning.

The SWIFT edge and vertex weights are estimated in the code, however the edge weights are not symmetric, which is a problem for Scotch (a Scotch graph requires symmetric edges). Therefore, in the Scotch graph the edge weights are set to the average value (sum/2) of the two associated edge weights calculated by SWIFT.

Test Runs
----------------
The following results were obtained on Cosma8 running with the `SCOTCH_STRATBALANCE` strategy string and an imbalance ratio of `0.05`.
+ +| Testcase | Resources | flags | Node types | Scotch32 (s) | Scotch64 (s) | Scotch local (s) | ParMetis (s) | Metis (s) | +| ---------- | --------------------------- | -------------- | ---------- | ------------ | ------------ | ---------------- | ------------ | --------- | +| EAGLE_006 | nodes = 1 (8 NUMA regions) | `--map_by numa` | Milan | 1191.8 | 1198.2 | 1173.6 | 1167.4 | 1176.4 | +| | -//- | -//- | Milan | 1176.7 | 1184.4 | 1193.8 | 1212.5 | 1182.1 | +| | -//- | -//- | Milan | 1174.5 | 1183.6 | 1175.2 | 1229.4 | 1180.7 | +| | -//- | -//- | Rome | 1368.8 | 1322.9 | 1351.5 | 1332.8 | 1334.9 | +| | -//- | -//- | Rome | 1378.3 | 1373.8 | 1353.4 | 1332.3 | 1346.8 | +| | -//- | -//- | Rome | 1367.3 | 1395.0 | 1361.0 | 1331.2 | 1330.8 | +| | | | | | | | | | +| | nodes = 2 (16 NUMA regions) | -//- | Milan | 1191.2 | 1225.8 | 1167.6 | 1154.0 | 1159.7 | +| | -//- | -//- | Milan | 1147.9 | 1185.0 | 1158.2 | 1168.3 | 1163.4 | +| | -//- | -//- | Milan | 1162.0 | 1180.4 | 1149.3 | 1147.7 | 1157.3 | +| | -//- | -//- | Rome | 1358.3 | 1538.3 | 1325.5 | 1338.8 | 1344.8 | +| | -//- | -//- | Rome | 1355.8 | 1519.3 | 1338.2 | 1390.1 | 1336.8 | +| | -//- | -//- | Rome | 1347.1 | 1395.0 | 1336.0 | 1345.4 | 1338.7 | +| | | | | | | | | | +| EAGLE_025 | nodes = 2 (16 NUMA regions) | `--map_by numa` | Milan | 7546.8 | 7450.0 | 7564.7 | 7202.0 | 7302.3 | +| | -//- | -//- | Milan | 7508.8 | 7490.4 | 7506.6 | 7416.3 | 7291.3 | +| | -//- | -//- | Milan | 7447.6 | 7516.5 | 7548.1 | 7093.1 | 7293.0 | +| | -//- | -//- | Rome | 8616.0 | 8660.5 | 8524.5 | 8810.5 | 8309.2 | +| | -//- | -//- | Rome | 8652.4 | 8492.7 | 8594.7 | 7955.9 | 8312.3 | +| | -//- | -//- | Rome | 8664.1 | 8565.2 | 8621.9 | 7946.7 | 8235.6 | +| | | | | | | | | | +| EAGLE_050 | nodes = 4 (32 NUMA regions) | `--map_by numa` | Milan | 45437.0 | 45714.8 | 45287.9 | 45110.4 | | +| | -//- | -//- | Milan | 45817.5 | 45128.4 | 45047.3 | 42131.7 | | +| | -//- | -//- | Milan | 45483.4 | 45213.9 | 45219.3 | 43263.8 | | +| | -//- | -//- | Rome | 51754.3 | 54724.4 | 51907.1 | 51315.1 | | +| | -//- | -//- | Rome | 51669.3 | 54213.8 | 51320.5 | 48338.0 | | +| | -//- | -//- | Rome | 51689.4 | 53387.0 | 51563.9 | 49702.8 | | +| | | | | | | | | | +| EAGLE_050 | nodes = 8 (64 NUMA regions) | `--map_by numa` | Milan | 36202.3 | 37158.3 | 36111.7 | 37006.0 | | +| | -//- | -//- | Milan | 36097.3 | 36503.1 | 36228.0 | 37344.7 | | +| | -//- | -//- | Milan | 36113.3 | 36155.7 | 36222.0 | 35488.4 | | +| | -//- | -//- | Rome | 42012.6 | 41790.7 | 41864.3 | 41723.6 | | +| | -//- | -//- | Rome | 41866.8 | 41517.0 | 41772.9 | 40533.6 | | +| | -//- | -//- | Rome | 43630.5 | 41419.9 | 41752.4 | 40628.3 | | + + + +Note: + +Cosma 8 cluster is currently comprised of: + + * 360 compute nodes with 1 TB RAM and dual 64-core AMD EPYC 7H12 water-cooled processors at 2.6GHz ("Rome"-type nodes) + * Upgraded with an additional 168 nodes with dual Milan 7763 processors at 2.45GHz ("Milan"-type nodes) + + + +Managing verbosity, logging info, dumping of intermediate results, debug info, restart files +--------------------------- +* If you pass the commandline switch `--enable-debugging-checks` to `./configure` while building, be prepared for additional logging and dump files. +* For large simulations, the amount of data dumps can easily overflow one's quota allowances; for example, the E_100 models can dump files on the order of `1TiBytes`, depending on runtime configuration, which can be specified in a `.yml` file. Please check the SWIFT docs. 
To keep dumping and logging to a minimum, you can, for example, put the following sections in a file `eagle_100.yml`:


```
# Parameters governing the snapshots
Snapshots:
  select_output_on: 1
  select_output: output_list.yml
  basename: eagle # Common part of the name of output files
  scale_factor_first: 0.91 # Scale-factor of the first snapshot (cosmological run)
  time_first: 0.01 # Time of the first output (non-cosmological run) (in internal units)
  delta_time: 1.01 # Time difference between consecutive outputs (in internal units)
  recording_triggers_part: [1.0227e-4, 1.0227e-5] # Recording starts 100M and 10M years before a snapshot
  recording_triggers_bpart: [1.0227e-4, 1.0227e-5] # Recording starts 100M and 10M years before a snapshot
```

and the contents of `output_list.yml` can be:
```
Default:
  Standard_Gas: off
  Standard_DM: off
  Standard_DMBackground: off
  Standard_Stars: off
  Standard_BH: off
```

In `eagle_100.yml` we can also specify that we do not want the restart dumps:
```
Restarts:
  enable: 0
```


Current Limitations
----------------
1. As seen in the table above, the current Scotch implementation is comparable in performance to the ParMETIS (METIS) implementation on problem sizes up to EAGLE_050. However, the current implementation runs into difficulties on the EAGLE_100 testcase. The Scotch partition in this case causes two separate errors: a memory overflow when running across 8 Cosma8 nodes, and, on 16 Cosma8 nodes, a partition in which certain ranks have more than 64 MPI proxies, which is a hard limit set within SWIFT. Ongoing work is focused on resolving this issue.
2. The current implementation has only been tested on periodic domains, i.e. where each vertex in SWIFT has exactly 26 neighbours. Additions to the edge-mean calculation in the `pick_scotch` function will be needed to support non-periodic domains.

Notes
----------------

1. Implementing the parallel version, [PT-Scotch](https://inria.hal.science/hal-00402893), should improve performance for large node-count runs.
2. Further improvement could be achieved by accurately obtaining, at run time, the cost to communicate across the resources provided by the scheduler. The approach above, using a pre-generated `tleaf` file, is an approximation; tools like [netloc](https://www.open-mpi.org/projects/netloc/), which derive representative latency values from the network fabric, would be the optimal solution. To begin with, this would require admin access to run the set-up commands that generate an overall graph of the whole HPC system. This graph structure would then be referenced at run time, together with the IDs of the allocated nodes, to build up an accurate representation of the available compute.


diff --git a/configure.ac b/configure.ac
index 62c0b3a7b5..f71bae899c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -791,6 +791,41 @@ AC_CHECK_LIB(pthread, posix_fallocate,
             AC_DEFINE([HAVE_POSIX_FALLOCATE], [1],
                       [The posix library implements file allocation functions.]),
             AC_MSG_WARN(POSIX implementation does not have file allocation functions.))
+# Check for SCOTCH.
+have_scotch="no"
+AC_ARG_WITH([scotch],
+    [AS_HELP_STRING([--with-scotch=PATH],
+       [root directory where SCOTCH is installed @<:@yes/no@:>@]
+    )],
+    [with_scotch="$withval"],
+    [with_scotch="no"]
+)
+
+SCOTCH_LIBS=""
+if test "x$with_scotch" != "xno"; then
+
+# Check if we have SCOTCH.
+ if test "x$with_scotch" != "xyes" -a "x$with_scotch" != "x"; then + SCOTCH_LIBS="-L$with_scotch/lib -lscotch -lscotcherr" + SCOTCH_INCS="-I$with_scotch/include" + else + SCOTCH_LIBS="-lscotch -lscotcherr" + SCOTCH_INCS="" + fi + AC_CHECK_LIB([scotch],[SCOTCH_graphInit], [have_scotch="yes"], + [have_scotch="no"], $SCOTCH_LIBS) + if test "$have_scotch" = "yes"; then + AC_DEFINE([HAVE_SCOTCH],1,[The SCOTCH library is present.]) + else + AC_MSG_ERROR("Failed to find a SCOTCH library") + fi +fi + +AC_SUBST([SCOTCH_LIBS]) +AC_SUBST([SCOTCH_INCS]) +AM_CONDITIONAL([HAVESCOTCH],[test -n "$SCOTCH_LIBS"]) + + # Check for METIS. have_metis="no" AC_ARG_WITH([metis], @@ -3051,6 +3086,7 @@ AC_MSG_RESULT([ HDF5 enabled : $with_hdf5 - parallel : $have_parallel_hdf5 METIS/ParMETIS : $have_metis / $have_parmetis + Scotch : $have_scotch FFTW3 enabled : $have_fftw - threaded/openmp : $have_threaded_fftw / $have_openmp_fftw - MPI : $have_mpi_fftw diff --git a/src/Makefile.am b/src/Makefile.am index 92640cac8c..2c81160f32 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -28,8 +28,8 @@ GIT_CMD = @GIT_CMD@ EXTRA_LIBS = $(GSL_LIBS) $(HDF5_LIBS) $(FFTW_LIBS) $(NUMA_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS) $(JEMALLOC_LIBS) $(TBBMALLOC_LIBS) $(GRACKLE_LIBS) $(SUNDIALS_LIBS) $(CHEALPIX_LIBS) # MPI libraries. -MPI_LIBS = $(PARMETIS_LIBS) $(METIS_LIBS) $(MPI_THREAD_LIBS) -MPI_FLAGS = -DWITH_MPI $(PARMETIS_INCS) $(METIS_INCS) +MPI_LIBS = $(SCOTCH_LIBS) $(PARMETIS_LIBS) $(METIS_LIBS) $(MPI_THREAD_LIBS) +MPI_FLAGS = -DWITH_MPI $(SCOTCH_INCS) $(PARMETIS_INCS) $(METIS_INCS) # Build the libswiftsim library and a convenience library just for the gravity tasks lib_LTLIBRARIES = libswiftsim.la diff --git a/src/debug.c b/src/debug.c index 978da76415..8b9456e278 100644 --- a/src/debug.c +++ b/src/debug.c @@ -637,7 +637,8 @@ void dumpCells(const char *prefix, int super, int active, int mpiactive, fclose(file); } -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) \ + && !defined(HAVE_SCOTCH) /** * @brief Dump a graph in METIS standard format, simple format and weights @@ -650,8 +651,7 @@ void dumpCells(const char *prefix, int super, int active, int mpiactive, * * The output filenames are generated from the prefix and the sequence number * of calls. So the first is called {prefix}_std_001.dat, - *{prefix}_simple_001.dat, - * {prefix}_weights_001.dat, etc. + * {prefix}_simple_001.dat, {prefix}_weights_001.dat, etc. 
 *
 * @param prefix base output filename
 * @param nvertices the number of vertices
diff --git a/src/debug.h b/src/debug.h
index 4ec9dc8127..f2c93e5509 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -39,8 +39,9 @@ int checkCellhdxmax(const struct cell *c, int *depth);
 void dumpCells(const char *prefix, int super, int active, int mpiactive,
                int pactive, struct space *s, int rank, int step);
-#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS))
-#include "metis.h"
+#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) \
+    && !defined(HAVE_SCOTCH)
+#include <metis.h>
 void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon, idx_t *xadj,
                     idx_t *adjncy, idx_t *vwgt, idx_t *vsize, idx_t *adjwgt);
 #endif
diff --git a/src/engine.c b/src/engine.c
index b545fd66d1..1021178a25 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -182,7 +182,8 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) {
  */
 void engine_repartition(struct engine *e) {
-#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS))
+#if defined(WITH_MPI) && \
+    (defined(HAVE_PARMETIS) || defined(HAVE_METIS) || defined(HAVE_SCOTCH))
   ticks tic = getticks();
@@ -254,7 +255,7 @@ void engine_repartition(struct engine *e) {
                 clocks_getunit());
 #else
   if (e->reparttype->type != REPART_NONE)
-    error("SWIFT was not compiled with MPI and METIS or ParMETIS support.");
+    error("SWIFT was not compiled with MPI and METIS, ParMETIS or SCOTCH support.");
   /* Clear the repartition flag. */
   e->forcerepart = 0;
diff --git a/src/partition.c b/src/partition.c
index 2030451dad..ac8b5c8854 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -24,7 +24,8 @@
  * a grid of cells into geometrically connected regions and distributing
  * these around a number of MPI nodes.
  *
- * Currently supported partitioning types: grid, vectorise and METIS/ParMETIS.
+ * Currently supported partitioning types:
+ * grid, vectorise, METIS/ParMETIS and SCOTCH.
  */
 /* Config parameters. */
@@ -45,12 +46,20 @@
 #ifdef WITH_MPI
 #include <mpi.h>
 /* METIS/ParMETIS headers only used when MPI is also available. */
-#ifdef HAVE_PARMETIS
+#if defined(HAVE_PARMETIS) && !defined(HAVE_SCOTCH)
 #include <parmetis.h>
 #endif
-#ifdef HAVE_METIS
+#if defined(HAVE_METIS) && !defined(HAVE_SCOTCH)
 #include <metis.h>
 #endif
+/* SCOTCH headers only used when MPI is also available. */
+#ifdef HAVE_SCOTCH
+#include <scotch.h>
+#define idx_t SCOTCH_Idx
+#define IDX_MAX (2147483647)
+SCOTCH_Arch the_archdat;
+SCOTCH_Arch *p_archdat = &the_archdat;
+#endif
 #endif
 /* Local headers. */
@@ -74,7 +83,8 @@ const char *initial_partition_name[] = {
 const char *repartition_name[] = {
     "none", "edge and vertex task cost weights", "task cost edge weights",
     "memory balanced, using particle vertex weights",
-    "vertex task costs and edge delta timebin weights"};
+    "vertex task costs and edge delta timebin weights",
+    "scotch mapping, edge and vertex cost weights"};
 /* Local functions, if needed. */
 static int check_complete(struct space *s, int verbose, int nregions);
@@ -83,7 +93,8 @@
 * Repartition fixed costs per type/subtype. These are determined from the
 * statistics output produced when running with task debugging enabled.
*/ -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) static double repartition_costs[task_type_count][task_subtype_count]; #endif #if defined(WITH_MPI) @@ -180,7 +191,8 @@ static void split_vector(struct space *s, int nregions, int *samplecells) { * the cells next updates. */ -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) && !defined(HAVE_SCOTCH) + /** * @brief Fill the adjncy array defining the graph of cells in a space. * @@ -326,7 +338,8 @@ static void graph_init(struct space *s, int periodic, idx_t *weights_e, } #endif -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) struct counts_mapper_data { double *counts; size_t size; @@ -556,7 +569,8 @@ static void sizes_to_edges(struct space *s, double *counts, double *edges) { } #endif -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) /** * @brief Apply METIS cell-list partitioning to a cell structure. * @@ -569,12 +583,12 @@ static void split_metis(struct space *s, int nregions, int *celllist) { for (int i = 0; i < s->nr_cells; i++) s->cells_top[i].nodeID = celllist[i]; /* To check or visualise the partition dump all the cells. */ - /*if (engine_rank == 0) dumpCellRanks("metis_partition", s->cells_top, - s->nr_cells);*/ + if (engine_rank == 0) dumpCellRanks("partition", s->cells_top, s->nr_cells); } #endif -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) /* qsort support. */ struct indexval { @@ -675,7 +689,7 @@ void permute_regions(int *newlist, int *oldlist, int nregions, int ncells, } #endif -#if defined(WITH_MPI) && defined(HAVE_PARMETIS) +#if defined(WITH_MPI) && defined(HAVE_PARMETIS) && !defined(HAVE_SCOTCH) /** * @brief Partition the given space into a number of connected regions using * ParMETIS. @@ -1186,7 +1200,7 @@ static void pick_parmetis(int nodeID, struct space *s, int nregions, } #endif -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) && !defined(HAVE_SCOTCH) /** * @brief Partition the given space into a number of connected regions. * @@ -1349,7 +1363,338 @@ static void pick_metis(int nodeID, struct space *s, int nregions, } #endif -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && defined(HAVE_SCOTCH) +/** + * @brief Fill the adjncy array defining the graph of cells in a space. + * + * The format is identical to the METIS and ParMETIS periodic domain case. + * The cell graph consists of all nodes as vertices with edges as the connections + * to all neighbours, so we have 26 per vertex for periodic boundary. + * Note you will also need an xadj array, for SCOTCH that would be: + * + * xadj[0] = 0; + * for (int k = 0; k < s->nr_cells; k++) xadj[k + 1] = xadj[k] + 26; + * + * @param s the space of cells. + * @param periodic whether to assume a periodic space (fixed 26 edges). + * @param weights_e the edge weights for the cells, if used. 
On input + * assumed to be ordered with a fixed 26 edges per cell, so + * will need reordering for non-periodic spaces. + * @param adjncy the adjncy array to fill, must be of size 26 * the number of + * cells in the space. + * @param nadjcny number of adjncy elements used, can be less if not periodic. + * @param xadj the Scotch xadj array to fill, must be of size + * number of cells in space + 1. NULL for not used. + * @param nxadj the number of xadj element used. + */ +static void graph_init_scotch(struct space *s, int periodic, + SCOTCH_Num *weights_e, idx_t *adjncy, + int *nadjcny, SCOTCH_Num *xadj, int *nxadj) { + + /* Loop over all cells in the space. */ + *nadjcny = 0; + int cid = 0; + for (int l = 0; l < s->cdim[0]; l++) { + for (int m = 0; m < s->cdim[1]; m++) { + for (int n = 0; n < s->cdim[2]; n++) { + + /* Visit all neighbours of this cell, wrapping space at edges. */ + int p = 0; + for (int i = -1; i <= 1; i++) { + int ii = l + i; + if (ii < 0) + ii += s->cdim[0]; + else if (ii >= s->cdim[0]) + ii -= s->cdim[0]; + for (int j = -1; j <= 1; j++) { + int jj = m + j; + if (jj < 0) + jj += s->cdim[1]; + else if (jj >= s->cdim[1]) + jj -= s->cdim[1]; + for (int k = -1; k <= 1; k++) { + int kk = n + k; + if (kk < 0) + kk += s->cdim[2]; + else if (kk >= s->cdim[2]) + kk -= s->cdim[2]; + + /* If not self, record id of neighbour. */ + if (i || j || k) { + adjncy[cid * 26 + p] = cell_getid(s->cdim, ii, jj, kk); + p++; + } + } + } + } + + /* Next cell. */ + cid++; + } + } + *nadjcny = cid * 26; + + /* If given set Scotch xadj. */ + if (xadj != NULL) { + xadj[0] = 0; + for (int k = 0; k < s->nr_cells; k++) xadj[k + 1] = xadj[k] + 26; + *nxadj = s->nr_cells; + } + } +} + +/** + * @brief Partition the given space into a number of connected regions and + * map to available architecture. + * + * Split the space and map to compute architecture using Scotch. to derive + * a partitions using the given edge and vertex weights. If no weights + * are given then an unweighted partition is performed. + * + * @param nodeID the rank of our node. + * @param s the space of cells to partition. + * @param nregions the number of regions required in the partition. + * @param vertexw weights for the cells, sizeof number of cells if used, + * NULL for unit weights. Need to be in the range of idx_t. + * @param edgew weights for the graph edges between all cells, sizeof number + * of cells * 26 if used, NULL for unit weights. Need to be packed + * in CSR format, so same as adjncy array. Need to be in the range of + * idx_t. + * @param celllist on exit this contains the ids of the selected regions, + * sizeof number of cells. + */ +static void pick_scotch(int nodeID, struct space *s, int nregions, + double *vertexw, double *edgew, int *celllist, + SCOTCH_Arch *archdat) { + + /* Total number of cells. */ + int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2]; + + /* Nothing much to do if only using a single partition. */ + if (nregions == 1) { + for (int i = 0; i < ncells; i++) celllist[i] = 0; + return; + } + /* Only one node needs to calculate this. */ + if (nodeID == 0) { + /* Allocate adjacency and weights arrays . 
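+     * For the periodic 26-neighbour cell graph the sizes are: xadj has
+     * ncells + 1 entries, adjncy and the edge weights have 26 * ncells
+     * entries, and the vertex weights and regionid have ncells entries each.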
*/ + SCOTCH_Num *xadj; + if ((xadj = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * (ncells + 1))) == + NULL) + error("Failed to allocate xadj buffer."); + idx_t *adjncy; + if ((adjncy = (idx_t *)malloc(sizeof(idx_t) * 26 * ncells)) == + NULL) + error("Failed to allocate adjncy array."); + SCOTCH_Num *weights_v = NULL; + if (vertexw != NULL) + if ((weights_v = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * ncells)) == + NULL) + error("Failed to allocate vertex weights array"); + SCOTCH_Num *weights_e = NULL; + if (edgew != NULL) + if ((weights_e = + (SCOTCH_Num *)malloc(26 * sizeof(SCOTCH_Num) * ncells)) == NULL) + error("Failed to allocate edge weights array"); + SCOTCH_Num *regionid; + if ((regionid = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * ncells)) == NULL) + error("Failed to allocate regionid array"); + + /* Init the vertex weights array. */ + if (vertexw != NULL) { + for (int k = 0; k < ncells; k++) { + if (vertexw[k] > 1) { + weights_v[k] = vertexw[k]; + } else { + weights_v[k] = 0; + } + } + +#ifdef SWIFT_DEBUG_CHECKS + /* Check weights are all in range. */ + int failed = 0; + for (int k = 0; k < ncells; k++) { + if ((SCOTCH_Num)vertexw[k] < 0) { + message("Input vertex weight out of range: %ld", (long)vertexw[k]); + failed++; + } + if (weights_v[k] < 0) { + message("Used vertex weight out of range: %ld", (long)weights_v[k]); + failed++; + } + } + if (failed > 0) error("%d vertex weights are out of range", failed); +#endif + } + + /* Init the edges weights array. */ + + if (edgew != NULL) { + for (int k = 0; k < 26 * ncells; k++) { + if (edgew[k] > 1) { + weights_e[k] = edgew[k]; + } else { + weights_e[k] = 1; + } + } + +#ifdef SWIFT_DEBUG_CHECKS + /* Check weights are all in range. */ + int failed = 0; + for (int k = 0; k < 26 * ncells; k++) { + + if ((SCOTCH_Num)edgew[k] < 0) { + message("Input edge weight out of range: %ld", (long)edgew[k]); + failed++; + } + if (weights_e[k] < 1) { + message("Used edge weight out of range: %" "I64d", (long long)weights_e[k]); + failed++; + } + } + if (failed > 0) error("%d edge weights are out of range", failed); +#endif + } + /* Define the cell graph. Keeping the edge weights association. */ + int nadjcny = 0; + int nxadj = 0; + graph_init_scotch(s, s->periodic, weights_e, adjncy, &nadjcny, xadj, + &nxadj); + /* Define the cell graph. Keeping the edge weights association. 
*/ + /* Setting up the Scotch graph */ + SCOTCH_Graph graph; + SCOTCH_Num baseval = 0; + SCOTCH_Num vertnbr = ncells; /* Number of vertices */ + SCOTCH_Num edgenbr = (26 * vertnbr); /* Number of edges (arcs) */ + + SCOTCH_Num *verttab; /* Vertex array [vertnbr + 1] */ + if ((verttab = (SCOTCH_Num *)malloc((vertnbr + 1) * sizeof(SCOTCH_Num))) == NULL) { + error("Failed to allocate Vertex array"); + } + + SCOTCH_Num *velotab; /* Vertex load array */ + if ((velotab = (SCOTCH_Num *)malloc((vertnbr) * sizeof(SCOTCH_Num))) == NULL) { + error("Failed to allocate Vertex load array"); + } + + SCOTCH_Num *edgetab; /* Edge array [edgenbr] */ + if ((edgetab = (SCOTCH_Num *)malloc((edgenbr) * sizeof(SCOTCH_Num))) == NULL) { + error("Failed to allocate Edge array"); + } + + SCOTCH_Num *edlotab; /* Int load of each edge */ + if ((edlotab = (SCOTCH_Num *)malloc((edgenbr) * sizeof(SCOTCH_Num))) == NULL) { + error("Failed to allocate Edge Load array"); + } + + int edges_deg = 26; + for (int i = 0; i < vertnbr; i++) { + verttab[i] = i * edges_deg; + velotab[i] = weights_v[i]; + } + verttab[vertnbr] = vertnbr * edges_deg; + + int vertex_count = 0; + int neighbour; + int return_edge = 0; + /* The bidirectional weights associated with an edge are averaged to ensure + that the resultant edges are symmetric. This is a neccessary for a Scotch + graph. Note that the below assumes a periodic domain, where each vertex has + 26 neighbours */ + for (int i = 0; i < edgenbr; i++) { + if ((i > (edges_deg - 1)) && (i % edges_deg == 0)) { + vertex_count++; + } + neighbour = adjncy[i]; + edgetab[i] = neighbour; + for (int j = 0; j < edges_deg; j++) { + if ((adjncy[(neighbour * edges_deg + j)]) == vertex_count) { + return_edge = (neighbour * edges_deg + j); + } + } + edlotab[i] = (SCOTCH_Num)((weights_e[i] + weights_e[return_edge]) / 2.0); + } + + SCOTCH_graphInit(&graph); + + if (SCOTCH_graphBuild(&graph, baseval, vertnbr, verttab, NULL, velotab, + NULL, edgenbr, edgetab, edlotab) != 0) { + error("Error: Cannot build Scotch Graph.\n"); + } + #ifdef SWIFT_DEBUG_CHECKS + SCOTCH_graphCheck(&graph); + static int partition_count = 0; + char fname[200]; + sprintf(fname, "scotch_input_com_graph_%03d.grf", partition_count++); + FILE *graph_file = fopen(fname, "w"); + if (graph_file == NULL) { + printf("Error: Cannot open output file.\n"); + } + + if (SCOTCH_graphSave(&graph, graph_file) != 0) { + printf("Error: Cannot save Scotch Graph.\n"); + } + fclose(graph_file); + #endif + + /* Initialise in strategy. */ + SCOTCH_Strat stradat; + SCOTCH_stratInit(&stradat); + SCOTCH_Num num_vertices; + /* Choose between different strategies: + * e.g., SCOTCH_STRATQUALITY, SCOTCH_STRATBALANCE, etc. + * SCOTCH_STRATBALANCE seems to be the best choice. + */ + SCOTCH_Num flagval = SCOTCH_STRATBALANCE; + num_vertices = SCOTCH_archSize(archdat); + if (SCOTCH_stratGraphMapBuild(&stradat, flagval, num_vertices, 0.05) != 0) + error("Error setting the Scotch mapping strategy."); + + /* Map the computation graph to the architecture graph */ + if (SCOTCH_graphMap(&graph, archdat, &stradat, regionid) != 0) + error("Error Scotch mapping failed."); + #ifdef SWIFT_DEBUG_CHECKS + SCOTCH_Mapping mappptr; + SCOTCH_graphMapInit(&graph, &mappptr, archdat, regionid); + FILE *map_stats = fopen("map_stats.out", "w"); + SCOTCH_graphMapView(&graph, &mappptr, map_stats); + fclose(map_stats); + #endif + /* Check that the regionids are ok. 
*/ + for (int k = 0; k < ncells; k++) { + if (regionid[k] < 0 || regionid[k] >= nregions) + error("Got bad nodeID for cell %i.", k); + /* And keep. */ + celllist[k] = regionid[k]; + } + SCOTCH_graphExit(&graph); + SCOTCH_stratExit(&stradat); + /* We will not be calling SCOTCH_archExit(archdat): + * this would destroy the contents of the archdat structure. + * The above two Scotch ...Exit() calls destroy localy defined + * structs, so they are OK to call. + */ + + if (verttab != NULL) free(verttab); + if (velotab != NULL) free(velotab); + if (edgetab != NULL) free(edgetab); + if (edlotab != NULL) free(edlotab); + /* Clean up. */ + if (weights_v != NULL) free(weights_v); + if (weights_e != NULL) free(weights_e); + free(xadj); + free(adjncy); + free(regionid); + } + + /* Calculations all done, now everyone gets a copy. */ + int res = MPI_Bcast(celllist, ncells, MPI_INT, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to broadcast new celllist"); +} +#endif + +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) /* Helper struct for partition_gather weights. */ struct weights_mapper_data { @@ -1532,6 +1877,9 @@ void partition_gather_weights(void *map_data, int num_elements, } } +#endif + +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) && !defined(HAVE_SCOTCH) /** * @brief Repartition the cells amongst the nodes using weights of * various kinds. @@ -1703,7 +2051,7 @@ static void repart_edge_metis(int vweights, int eweights, int timebins, } /* And repartition/ partition, using both weights or not as requested. */ -#ifdef HAVE_PARMETIS +#if defined(HAVE_PARMETIS) && !defined(HAVE_SCOTCH) if (repartition->usemetis) { pick_metis(nodeID, s, nr_nodes, weights_v, weights_e, repartition->celllist); @@ -1712,7 +2060,7 @@ static void repart_edge_metis(int vweights, int eweights, int timebins, repartition->adaptive, repartition->itr, repartition->celllist); } -#else +#elif !defined(HAVE_SCOTCH) pick_metis(nodeID, s, nr_nodes, weights_v, weights_e, repartition->celllist); #endif @@ -1801,7 +2149,7 @@ static void repart_memory_metis(struct repartition *repartition, int nodeID, } /* And repartition. */ -#ifdef HAVE_PARMETIS +#if defined(HAVE_PARMETIS) && !defined(HAVE_SCOTCH) if (repartition->usemetis) { pick_metis(nodeID, s, nr_nodes, weights, NULL, repartition->celllist); } else { @@ -1809,7 +2157,7 @@ static void repart_memory_metis(struct repartition *repartition, int nodeID, repartition->adaptive, repartition->itr, repartition->celllist); } -#else +#elif !defined(HAVE_SCOTCH) pick_metis(nodeID, s, nr_nodes, weights, NULL, repartition->celllist); #endif @@ -1846,7 +2194,219 @@ static void repart_memory_metis(struct repartition *repartition, int nodeID, /* And apply to our cells */ split_metis(s, nr_nodes, repartition->celllist); } -#endif /* WITH_MPI && (HAVE_METIS || HAVE_PARMETIS) */ +#endif /* WITH_MPI && HAVE_METIS || HAVE_PARMETIS && !defined(HAVE_SCOTCH) */ + +#if WITH_MPI && HAVE_SCOTCH +/** + * @brief Repartition the cells amongst the nodes using weights based on + * the memory use of particles in the cells. + * + * @param repartition the partition struct of the local engine. + * @param nodeID our nodeID. + * @param nr_nodes the number of nodes. + * @param s the space of cells holding our local particles. 
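+ * @param vweights whether vertex weights (task costs) should be used.
+ * @param eweights whether edge weights should be used.
+ * @param timebins whether to use the timebin deltas rather than task ticks
+ *        as the edge weights.
+ * @param tasks the list of tasks used to estimate the weights.
+ * @param nr_tasks the number of tasks.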
+ */ +static void repart_scotch(int vweights, int eweights, int timebins, + struct repartition *repartition, int nodeID, + int nr_nodes, struct space *s, struct task *tasks, + int nr_tasks) { + + /* Create weight arrays using task ticks for vertices and edges (edges + * * assume the same graph structure as used in the part_ calls). */ + int nr_cells = s->nr_cells; + struct cell *cells = s->cells_top; + + /* Allocate and fill the adjncy indexing array defining the graph of + * cells. + */ + idx_t *inds; + if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 * nr_cells)) == NULL) + error("Failed to allocate the inds array"); + int nadjcny = 0; + int nxadj = 0; + + graph_init_scotch(s, 1 /* periodic */, NULL /* no edge weights */, inds, + &nadjcny, NULL /* no xadj needed */, &nxadj); + + /* Allocate and init weights. */ + double *weights_v = NULL; + double *weights_e = NULL; + if (vweights) { + if ((weights_v = (double *)malloc(sizeof(double) * nr_cells)) == NULL) + error("Failed to allocate vertex weights arrays."); + bzero(weights_v, sizeof(double) * nr_cells); + } + if (eweights) { + if ((weights_e = (double *)malloc(sizeof(double) * 26 * nr_cells)) == NULL) + error("Failed to allocate edge weights arrays."); + bzero(weights_e, sizeof(double) * 26 * nr_cells); + } + + /* Gather weights. */ + struct weights_mapper_data weights_data; + + weights_data.cells = cells; + weights_data.eweights = eweights; + weights_data.inds = inds; + weights_data.nodeID = nodeID; + weights_data.nr_cells = nr_cells; + weights_data.timebins = timebins; + weights_data.vweights = vweights; + weights_data.weights_e = weights_e; + weights_data.weights_v = weights_v; + weights_data.use_ticks = repartition->use_ticks; + + ticks tic = getticks(); + + threadpool_map(&s->e->threadpool, partition_gather_weights, tasks, nr_tasks, + sizeof(struct task), threadpool_auto_chunk_size, + &weights_data); + if (s->e->verbose) + message("weight mapper took %.3f %s.", clocks_from_ticks(getticks() - tic), + clocks_getunit()); + +#ifdef SWIFT_DEBUG_CHECKS + check_weights(tasks, nr_tasks, &weights_data, weights_v, weights_e); +#endif + + /* Merge the weights arrays across all nodes. */ + int res; + if (vweights) { + res = MPI_Allreduce(MPI_IN_PLACE, weights_v, nr_cells, MPI_DOUBLE, MPI_SUM, + MPI_COMM_WORLD); + if (res != MPI_SUCCESS) + mpi_error(res, "Failed to allreduce vertex weights."); + } + + if (eweights) { + res = MPI_Allreduce(MPI_IN_PLACE, weights_e, 26 * nr_cells, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to allreduce edge weights."); + } + + /* Allocate cell list for the partition. If not already done. */ + if (repartition->ncelllist != s->nr_cells) { + free(repartition->celllist); + repartition->ncelllist = 0; + if ((repartition->celllist = (int *)malloc(sizeof(int) * s->nr_cells)) == + NULL) + error("Failed to allocate celllist"); + repartition->ncelllist = s->nr_cells; + } + + /* We need to rescale the sum of the weights so that the sums of the two + * types of weights are less than IDX_MAX, that is the range of idx_t. + */ + double vsum = 0.0; + if (vweights) + for (int k = 0; k < nr_cells; k++) vsum += weights_v[k]; + double esum = 0.0; + if (eweights) + for (int k = 0; k < 26 * nr_cells; k++) esum += weights_e[k]; + + /* Do the scaling, if needed, keeping both weights in proportion. 
*/ + double vscale = 1.0; + double escale = 1.0; + if (vweights && eweights) { + if (vsum > esum) { + if (vsum > (double)IDX_MAX) { + vscale = (double)(IDX_MAX - 10000) / vsum; + escale = vscale; + } + } else { + if (esum > (double)IDX_MAX) { + escale = (double)(IDX_MAX - 10000) / esum; + vscale = escale; + } + } + } else if (vweights) { + if (vsum > (double)IDX_MAX) { + vscale = (double)(IDX_MAX - 10000) / vsum; + } + } else if (eweights) { + if (esum > (double)IDX_MAX) { + escale = (double)(IDX_MAX - 10000) / esum; + } + } + + if (vweights && vscale != 1.0) { + vsum = 0.0; + for (int k = 0; k < nr_cells; k++) { + weights_v[k] *= vscale; + vsum += weights_v[k]; + } + vscale = 1.0; + } + if (eweights && escale != 1.0) { + esum = 0.0; + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] *= escale; + esum += weights_e[k]; + } + escale = 1.0; + } + + /* Balance edges and vertices when the edge weights are timebins, as these + * have no reason to have equivalent scales, we use an equipartition. + */ + if (timebins && eweights) { + + /* Make sums the same. */ + if (vsum > esum) { + escale = vsum / esum; + for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= escale; + } else { + vscale = esum / vsum; + for (int k = 0; k < nr_cells; k++) weights_v[k] *= vscale; + } + } + + /* And repartition/ partition, using both weights or not as requested. */ +#ifdef HAVE_SCOTCH + pick_scotch(nodeID, s, nr_nodes, weights_v, weights_e, repartition->celllist, p_archdat); +#endif + /* Check that all cells have good values. All nodes have same copy, so just + * * check on one. */ + if (nodeID == 0) { + for (int k = 0; k < nr_cells; k++) + if (repartition->celllist[k] < 0 || repartition->celllist[k] >= nr_nodes) + error("Got bad nodeID %d for cell %i.", repartition->celllist[k], k); + } + + /* Check that the partition is complete and all nodes have some work. */ + int present[nr_nodes]; + int failed = 0; + for (int i = 0; i < nr_nodes; i++) present[i] = 0; + for (int i = 0; i < nr_cells; i++) present[repartition->celllist[i]]++; + for (int i = 0; i < nr_nodes; i++) { + if (!present[i]) { + failed = 1; + if (nodeID == 0) message("Node %d is not present after repartition", i); + } + } + + /* If partition failed continue with the current one, but make this clear. */ + if (failed) { + if (nodeID == 0) + message( + "WARNING: SCOTCH repartition has failed, continuing with the current" + " partition, load balance will not be optimal"); + for (int k = 0; k < nr_cells; k++) + repartition->celllist[k] = cells[k].nodeID; + } else { + if (nodeID == 0) + message("SCOTCH repartition successful."); + } + + /* And apply to our cells */ + split_metis(s, nr_nodes, repartition->celllist); + + /* Clean up. */ + free(inds); + if (vweights) free(weights_v); + if (eweights) free(weights_e); +} +#endif /* WITH_MPI && HAVE_SCOTCH */ /** * @brief Repartition the space using the given repartition type. 
@@ -1865,8 +2425,7 @@ void partition_repartition(struct repartition *reparttype, int nodeID, int nr_nodes, struct space *s, struct task *tasks, int nr_tasks) { -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) - +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) && !defined(HAVE_SCOTCH) ticks tic = getticks(); if (reparttype->type == REPART_METIS_VERTEX_EDGE_COSTS) { @@ -1894,11 +2453,27 @@ void partition_repartition(struct repartition *reparttype, int nodeID, if (s->e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit()); +#elif defined(WITH_MPI) && defined(HAVE_SCOTCH) + ticks tic = getticks(); + + if (reparttype->type == REPART_SCOTCH) { + repart_scotch(1, 1, 0, reparttype, nodeID, nr_nodes, s, tasks, nr_tasks); + + } else if (reparttype->type == REPART_NONE) { + /* Doing nothing. */ + + } else { + error("Impossible repartition type"); + } + + if (s->e->verbose) + message("took %.3f %s.", clocks_from_ticks(getticks() - tic), + clocks_getunit()); + #else - error("SWIFT was not compiled with METIS or ParMETIS support."); + error("SWIFT was not compiled with METIS, ParMETIS or Scotch support."); #endif } - /** * @brief Initial partition of space cells. * @@ -1948,17 +2523,18 @@ void partition_initial_partition(struct partition *initial_partition, return; } - } else if (initial_partition->type == INITPART_METIS_WEIGHT || - initial_partition->type == INITPART_METIS_WEIGHT_EDGE || - initial_partition->type == INITPART_METIS_NOWEIGHT) { -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) + } else if (initial_partition->type == INITPART_WEIGHT || + initial_partition->type == INITPART_WEIGHT_EDGE || + initial_partition->type == INITPART_NOWEIGHT) { +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) /* Simple k-way partition selected by METIS using cell particle * counts as weights or not. Should be best when starting with a * inhomogeneous dist. */ double *weights_v = NULL; double *weights_e = NULL; - if (initial_partition->type == INITPART_METIS_WEIGHT) { + if (initial_partition->type == INITPART_WEIGHT) { /* Particles sizes per cell, which will be used as weights. */ if ((weights_v = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) error("Failed to allocate weights_v buffer."); @@ -1966,7 +2542,7 @@ void partition_initial_partition(struct partition *initial_partition, /* Check each particle and accumulate the sizes per cell. */ accumulate_sizes(s, s->e->verbose, weights_v); - } else if (initial_partition->type == INITPART_METIS_WEIGHT_EDGE) { + } else if (initial_partition->type == INITPART_WEIGHT_EDGE) { /* Particle sizes also counted towards the edges. 
*/ @@ -1987,7 +2563,16 @@ void partition_initial_partition(struct partition *initial_partition, int *celllist = NULL; if ((celllist = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) error("Failed to allocate celllist"); -#ifdef HAVE_PARMETIS +#ifdef HAVE_SCOTCH + FILE *arch_file = fopen(initial_partition->target_arch_file, "r"); + if (arch_file == NULL) + error("Error: Cannot open topo file."); + /* Load the architecture graph in .tgt format */ + if (SCOTCH_archLoad(p_archdat, arch_file) != 0) + error("Error loading architecture graph"); + fclose(arch_file); + pick_scotch(nodeID, s, nr_nodes, weights_v, weights_e, celllist, p_archdat); +#elif HAVE_PARMETIS if (initial_partition->usemetis) { pick_metis(nodeID, s, nr_nodes, weights_v, weights_e, celllist); } else { @@ -1997,7 +2582,6 @@ void partition_initial_partition(struct partition *initial_partition, #else pick_metis(nodeID, s, nr_nodes, weights_v, weights_e, celllist); #endif - /* And apply to our cells */ split_metis(s, nr_nodes, celllist); @@ -2005,7 +2589,11 @@ void partition_initial_partition(struct partition *initial_partition, * proceeding. */ if (!check_complete(s, (nodeID == 0), nr_nodes)) { if (nodeID == 0) +#ifdef HAVE_SCOTCH + message("SCOTCH initial partition failed, using a vectorised partition"); +#else message("METIS initial partition failed, using a vectorised partition"); +#endif initial_partition->type = INITPART_VECTORIZE; partition_initial_partition(initial_partition, nodeID, nr_nodes, s); } @@ -2064,9 +2652,12 @@ void partition_init(struct partition *partition, #ifdef WITH_MPI /* Defaults make use of METIS if available */ -#if defined(HAVE_METIS) || defined(HAVE_PARMETIS) +#if ((defined(HAVE_METIS) || defined(HAVE_PARMETIS)) && !defined(HAVE_SCOTCH)) const char *default_repart = "fullcosts"; const char *default_part = "edgememory"; +#elif defined(HAVE_SCOTCH) + const char *default_repart = "scotch"; + const char *default_part = "edgememory"; #else const char *default_repart = "none"; const char *default_part = "grid"; @@ -2090,15 +2681,15 @@ void partition_init(struct partition *partition, case 'v': partition->type = INITPART_VECTORIZE; break; -#if defined(HAVE_METIS) || defined(HAVE_PARMETIS) +#if defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH) case 'r': - partition->type = INITPART_METIS_NOWEIGHT; + partition->type = INITPART_NOWEIGHT; break; case 'm': - partition->type = INITPART_METIS_WEIGHT; + partition->type = INITPART_WEIGHT; break; case 'e': - partition->type = INITPART_METIS_WEIGHT_EDGE; + partition->type = INITPART_WEIGHT_EDGE; break; default: message("Invalid choice of initial partition type '%s'.", part_type); @@ -2127,7 +2718,7 @@ void partition_init(struct partition *partition, if (strcmp("none", part_type) == 0) { repartition->type = REPART_NONE; -#if defined(HAVE_METIS) || defined(HAVE_PARMETIS) +#if defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH) } else if (strcmp("fullcosts", part_type) == 0) { repartition->type = REPART_METIS_VERTEX_EDGE_COSTS; @@ -2140,6 +2731,9 @@ void partition_init(struct partition *partition, } else if (strcmp("timecosts", part_type) == 0) { repartition->type = REPART_METIS_VERTEX_COSTS_TIMEBINS; + } else if (strcmp("scotch", part_type) == 0) { + repartition->type = REPART_SCOTCH; + } else { message("Invalid choice of re-partition type '%s'.", part_type); error( @@ -2152,6 +2746,7 @@ void partition_init(struct partition *partition, "Permitted values are: 'none' when compiled without " "METIS or ParMETIS."); #endif + 
message("Choice of re-partition type '%s'.", part_type); } /* Get the fraction CPU time difference between nodes (<1) or the number @@ -2231,7 +2826,8 @@ void partition_init(struct partition *partition, */ static int repart_init_fixed_costs(void) { -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) /* Set the default fixed cost. */ for (int j = 0; j < task_type_count; j++) { for (int k = 0; k < task_subtype_count; k++) { @@ -2284,7 +2880,8 @@ static int check_complete(struct space *s, int verbose, int nregions) { return (!failed); } -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +#if defined(WITH_MPI) && \ + (defined(HAVE_METIS) || defined(HAVE_PARMETIS) || defined(HAVE_SCOTCH)) #ifdef SWIFT_DEBUG_CHECKS /** * @brief Check that the threadpool version of the weights construction is diff --git a/src/partition.h b/src/partition.h index 8f6dbbd148..5a7d69cc84 100644 --- a/src/partition.h +++ b/src/partition.h @@ -27,9 +27,9 @@ enum partition_type { INITPART_GRID = 0, INITPART_VECTORIZE, - INITPART_METIS_WEIGHT, - INITPART_METIS_NOWEIGHT, - INITPART_METIS_WEIGHT_EDGE + INITPART_WEIGHT, + INITPART_NOWEIGHT, + INITPART_WEIGHT_EDGE }; /* Simple descriptions of types for reports. */ @@ -40,6 +40,7 @@ struct partition { enum partition_type type; int grid[3]; int usemetis; + char target_arch_file[PARSER_MAX_LINE_SIZE]; }; /* Repartition type to use. */ @@ -48,7 +49,8 @@ enum repartition_type { REPART_METIS_VERTEX_EDGE_COSTS, REPART_METIS_EDGE_COSTS, REPART_METIS_VERTEX_COUNTS, - REPART_METIS_VERTEX_COSTS_TIMEBINS + REPART_METIS_VERTEX_COSTS_TIMEBINS, + REPART_SCOTCH }; /* Repartition preferences. */ diff --git a/src/space_regrid.c b/src/space_regrid.c index 95fa4d9cd9..18e15c524b 100644 --- a/src/space_regrid.c +++ b/src/space_regrid.c @@ -351,7 +351,7 @@ void space_regrid(struct space *s, int verbose) { message("Failed to get a new partition, trying less optimal method"); struct partition initial_partition; #if defined(HAVE_PARMETIS) || defined(HAVE_METIS) - initial_partition.type = INITPART_METIS_NOWEIGHT; + initial_partition.type = INITPART_NOWEIGHT; #else initial_partition.type = INITPART_VECTORIZE; #endif diff --git a/src/version.c b/src/version.c index 9d8642573d..0a8bc630e9 100644 --- a/src/version.c +++ b/src/version.c @@ -24,10 +24,11 @@ /* MPI headers. 
*/ #ifdef WITH_MPI #include -#ifdef HAVE_METIS +#if defined(HAVE_METIS) && !defined(HAVE_SCOTCH) #include #endif -#ifdef HAVE_PARMETIS +#if defined(HAVE_PARMETIS) +#include #include #endif #endif @@ -318,7 +319,7 @@ const char *hdf5_version(void) { const char *metis_version(void) { static char version[256] = {0}; -#if defined(WITH_MPI) && defined(HAVE_METIS) +#if defined(WITH_MPI) && defined(HAVE_METIS) && !defined(HAVE_SCOTCH) sprintf(version, "%i.%i.%i", METIS_VER_MAJOR, METIS_VER_MINOR, METIS_VER_SUBMINOR); #else @@ -335,7 +336,7 @@ const char *metis_version(void) { const char *parmetis_version(void) { static char version[256] = {0}; -#if defined(WITH_MPI) && defined(HAVE_PARMETIS) +#if defined(WITH_MPI) && defined(HAVE_PARMETIS) && !defined(HAVE_SCOTCH) sprintf(version, "%i.%i.%i", PARMETIS_MAJOR_VERSION, PARMETIS_MINOR_VERSION, PARMETIS_SUBMINOR_VERSION); #else diff --git a/swift.c b/swift.c index 4507ac055c..db46df6712 100644 --- a/swift.c +++ b/swift.c @@ -144,7 +144,7 @@ int main(int argc, char *argv[]) { MPI_SUCCESS) error("Call to MPI_Comm_set_errhandler failed with error %i.", res); if (myrank == 0) - pretime_message("MPI is up and running with %i node(s).\n", nr_nodes); + pretime_message("MPI is up and running with %i rank(s).\n", nr_nodes); if (nr_nodes == 1) { pretime_message("WARNING: you are running with one MPI rank."); pretime_message( @@ -205,6 +205,7 @@ int main(int argc, char *argv[]) { char *output_parameters_filename = NULL; char *cpufreqarg = NULL; char *param_filename = NULL; + char *scotch_tgtfile = NULL; char restart_file[200] = ""; unsigned long long cpufreq = 0; float dump_tasks_threshold = 0.f; @@ -360,6 +361,10 @@ int main(int argc, char *argv[]) { "Fraction of the total step's time spent in a task to trigger " "a dump of the task plot on this step", NULL, 0, 0), + OPT_STRING('o', "scotch-target-file", &scotch_tgtfile, + "Target file of the architecture which is needed to carry " + "out Scotch mappings", + NULL, 0, 0), OPT_END(), }; struct argparse argparse; @@ -413,7 +418,6 @@ int main(int argc, char *argv[]) { /* Deal with thread numbers */ if (nr_pool_threads == -1) nr_pool_threads = nr_threads; - /* Write output parameter file */ if (myrank == 0 && output_parameters_filename != NULL) { io_write_output_field_parameter(output_parameters_filename, with_cosmology, with_fof, with_structure_finding); @@ -871,19 +875,34 @@ int main(int argc, char *argv[]) { struct repartition reparttype; #ifdef WITH_MPI struct partition initial_partition; +#if defined(HAVE_SCOTCH) + /* need to provide arch file name before partition_init() is called */ + if (scotch_tgtfile != NULL){ + strcpy(initial_partition.target_arch_file, scotch_tgtfile); + } else { + error("No Scotch target architecture file provided."); + } +#endif partition_init(&initial_partition, &reparttype, params, nr_nodes); /* Let's report what we did */ if (myrank == 0) { -#if defined(HAVE_PARMETIS) +#if defined(HAVE_PARMETIS) && !defined(HAVE_SCOTCH) if (reparttype.usemetis) message("Using METIS serial partitioning:"); else message("Using ParMETIS partitioning:"); -#elif defined(HAVE_METIS) +#elif defined(HAVE_METIS) && !defined(HAVE_SCOTCH) message("Using METIS serial partitioning:"); +#elif defined(HAVE_SCOTCH) + message("Using Scotch serial mapping:"); + if (scotch_tgtfile != NULL){ + message("Using the Scotch Target file: %s", scotch_tgtfile); + } else { /* extra failsafe check */ + error("Scotch mapping will fail: no target architecture file provided."); + } #else - message("Non-METIS partitioning:"); + 
message("Non-METIS and Non-SCOTCH partitioning:"); #endif message(" initial partitioning: %s", initial_partition_name[initial_partition.type]); diff --git a/swift_fof.c b/swift_fof.c index af7aefd784..213c42facd 100644 --- a/swift_fof.c +++ b/swift_fof.c @@ -154,6 +154,7 @@ int main(int argc, char *argv[]) { char *output_parameters_filename = NULL; char *cpufreqarg = NULL; char *param_filename = NULL; + char *scotch_tgtfile = NULL; unsigned long long cpufreq = 0; struct cmdparams cmdps; cmdps.nparam = 0; @@ -203,6 +204,10 @@ int main(int argc, char *argv[]) { OPT_INTEGER('Y', "threadpool-dumps", &dump_threadpool, "Time-step frequency at which threadpool tasks are dumped.", NULL, 0, 0), + OPT_STRING('o', "scotch-target-file", &scotch_tgtfile, + "Target file of the architecture which is needed to carry " + "out Scotch mappings", + NULL, 0, 0), OPT_END(), }; struct argparse argparse; @@ -376,19 +381,34 @@ int main(int argc, char *argv[]) { struct repartition reparttype; #ifdef WITH_MPI struct partition initial_partition; +#if defined(HAVE_SCOTCH) + /* need to provide arch file name before partition_init() is called */ + if (scotch_tgtfile != NULL){ + strcpy(initial_partition.target_arch_file, scotch_tgtfile); + } else { + error("No Scotch target architecture file provided."); + } +#endif partition_init(&initial_partition, &reparttype, params, nr_nodes); /* Let's report what we did */ if (myrank == 0) { -#if defined(HAVE_PARMETIS) +#if defined(HAVE_PARMETIS) && !defined(HAVE_SCOTCH) if (reparttype.usemetis) message("Using METIS serial partitioning:"); else message("Using ParMETIS partitioning:"); -#elif defined(HAVE_METIS) +#elif defined(HAVE_METIS) && !defined(HAVE_SCOTCH) message("Using METIS serial partitioning:"); +#elif defined(HAVE_SCOTCH) + message("Using Scotch serial mapping:"); + if (scotch_tgtfile != NULL){ + message("Using the Scotch Target file: %s", scotch_tgtfile); + } else { /* extra failsafe check */ + error("Scotch mapping will fail: no target architecture file provided."); + } #else - message("Non-METIS partitioning:"); + message("Non-METIS and Non-SCOTCH partitioning:"); #endif message(" initial partitioning: %s", initial_partition_name[initial_partition.type]); diff --git a/target_files/1_cosma_node.tgt b/target_files/1_cosma_node.tgt new file mode 100644 index 0000000000..d82fa65cd0 --- /dev/null +++ b/target_files/1_cosma_node.tgt @@ -0,0 +1 @@ +tleaf 2 2 100 4 10 \ No newline at end of file diff --git a/target_files/2_cosma_nodes.tgt b/target_files/2_cosma_nodes.tgt new file mode 100644 index 0000000000..08bc0a3ccc --- /dev/null +++ b/target_files/2_cosma_nodes.tgt @@ -0,0 +1 @@ +tleaf 3 2 1000 2 100 4 10 \ No newline at end of file diff --git a/target_files/4_cosma_nodes.tgt b/target_files/4_cosma_nodes.tgt new file mode 100644 index 0000000000..2fb58c5047 --- /dev/null +++ b/target_files/4_cosma_nodes.tgt @@ -0,0 +1 @@ +tleaf 3 4 1000 2 100 4 10 \ No newline at end of file