From d555e1b2a956c095f6e1f00b237993d081694dc6 Mon Sep 17 00:00:00 2001 From: Bernat Font Date: Sat, 16 Dec 2023 00:39:23 +0100 Subject: [PATCH 1/9] General benchmarking framework [WIP] --- Project.toml | 3 + benchmark/launch_bernchmarks.sh | 102 ++++++++++++++++++++++++++++++++ benchmark/tgv.jl | 39 ++++++++++++ benchmark/util.jl | 38 ++++++++++++ src/WaterLily.jl | 8 +-- 5 files changed, 186 insertions(+), 4 deletions(-) create mode 100644 benchmark/launch_bernchmarks.sh create mode 100644 benchmark/tgv.jl create mode 100644 benchmark/util.jl diff --git a/Project.toml b/Project.toml index 1eecafcf..6d754337 100644 --- a/Project.toml +++ b/Project.toml @@ -5,11 +5,14 @@ version = "1.0.3" [deps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OutMacro = "0ae4d431-9932-4135-a8f1-51ee5e017775" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/benchmark/launch_bernchmarks.sh b/benchmark/launch_bernchmarks.sh new file mode 100644 index 00000000..3dd85793 --- /dev/null +++ b/benchmark/launch_bernchmarks.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Update project environment with new Julia version +update_environment () { + echo "Updating environment to Julia v$version" + # juliaup default $version + julia --project -e "using Pkg; Pkg.update(); Pkg.precompile()" +} + +display_info () { + echo "--------------------------------------" + echo "Running benchmark tests for: + - Julia: ${VERSIONS[@]} + - Backends: ${BACKENDS[@]}" + if [[ " ${BACKENDS[*]} " =~ [[:space:]]'Array'[[:space:]] ]]; then + echo " - CPU threads: ${THREADS[@]}" + fi + echo " - Cases: ${CASES[@]} + - Size: ${LOG2N[@]} + - Sim. time: ${TEND[@]} + - Max. steps: ${MAXSTEPS[@]}" + echo "--------------------------------------"; echo +} + +# Defaults +# VERSIONS=('1.8.5' '1.10.0-rc2') +VERSIONS=('1.8.5') +BACKENDS=('Array' 'CuArray') +THREADS=('1' '6') +CASES=('tgv.jl' 'donut.jl') +LOG2N=('(4,5,6)' '(7,8,9)') +TEND=('10.0' '10.0') +MAXSTEPS=('100' '100') + +# Parse arguments +while [ $# -gt 0 ]; do +case "$1" in + --versions|-v) + VERSIONS=($2) + shift + ;; + --backends|-b) + BACKENDS=($2) + shift + ;; + --threads|-t) + THREADS=($2) + shift + ;; + --cases|-c) + CASES=($2) + shift + ;; + --log2n|-log2n) + LOG2N=($2) + shift + ;; + --t_end|-tend) + MAXSTEPS=($2) + shift + ;; + --max_steps|-ms) + MAXSTEPS=($2) + shift + ;; + *) + printf "ERROR: Invalid argument\n" + exit 1 +esac +shift +done + +# Assert "Array" backend is present if "--threads" argument is passed +if [[ " ${BACKENDS[*]} " =~ [[:space:]]'Array'[[:space:]] ]]; then + if [ "${#THREADS[@]}" == 0 ]; then + echo "ERROR: Backend 'Array' present, '--threads' argument is empty." 
+ exit 1 + fi +fi + +# Display information +display_info + +# Benchmarks +for version in "${VERSIONS[@]}" ; do + echo "Julia v$version benchmaks" + for backend in "${BACKENDS[@]}" ; do + if [ "${backend}" == "Array" ]; then + for thread in "${THREADS[@]}" ; do + args="-t $thread " + done + else + echo "Backend is not Array" + fi + done + # update_environment + for case in "${CASES[@]}" ; do + for log2n in "${LOG2N[@]}" ; do + + done + done +done \ No newline at end of file diff --git a/benchmark/tgv.jl b/benchmark/tgv.jl new file mode 100644 index 00000000..2a867c9e --- /dev/null +++ b/benchmark/tgv.jl @@ -0,0 +1,39 @@ +using WaterLily +using BenchmarkTools +using CUDA: CuArray +using KernelAbstractions: synchronize, get_backend +using JLD2 +using OutMacro + +include("util.jl") + +log2n, t_end, max_steps, dtype, backend, samples = parse_cla(ARGS; + log2n=(5,6,7,8), t_end=1.0, max_steps=10, dtype=Float32, backend=Array, samples=5) +evals = 5 +verbose = true + +function TGV(p, backend; Re=1e5, T=Float32) + # Define vortex size, velocity, viscosity + L = 2^p; U = 1; ν = U*L/Re + # Taylor-Green-Vortex initial velocity field + function uλ(i,vx) + x,y,z = @. (vx-1.5)*π/L # scaled coordinates + i==1 && return -U*sin(x)*cos(y)*cos(z) # u_x + i==2 && return U*cos(x)*sin(y)*cos(z) # u_y + return 0. # u_z + end + # Initialize simulation + return Simulation((L,L,L), (0,0,0), L; U=U, uλ=uλ, ν=ν, T=T, mem=backend) +end + +function benchmark() + suite, results = BenchmarkGroup(), BenchmarkGroup() + sim_step!(TGV(log2n[1], backend; T=dtype), t_end; max_steps=1, verbose=true, remeasure=false) # warm up + add_to_suite!(suite, TGV; log2n=log2n, t_end=t_end, max_steps=max_steps, dtype=dtype, backend=backend) # create benchmark + # tune!(suite) + results[backend_str[backend]] = run(suite[backend_str[backend]], samples=samples, evals=evals, seconds=1e6, verbose=verbose) # run! + fname = string(@__DIR__)*"/tgv_simstep_p$(log2n...)_$(backend_str[backend])_v$VERSION.dat" + save_object(fname, results) # save benchmark +end + +benchmark() \ No newline at end of file diff --git a/benchmark/util.jl b/benchmark/util.jl new file mode 100644 index 00000000..bcb8fa5c --- /dev/null +++ b/benchmark/util.jl @@ -0,0 +1,38 @@ +function parse_cla(args; log2n=(2,3,4), t_end=1.0, max_steps=10, dtype=Float32, backend=Array, samples=1) + iarg(arg) = occursin.(arg, args) |> findfirst + parse_tuple(T, s) = Tuple(parse.(T, split(strip(s, ['(', ')', ' ']), ','; keepempty=false))) + arg_value(arg) = split(args[iarg(arg)], "=")[end] + + log2n = !isnothing(iarg("log2n")) ? arg_value("log2n") |> x -> parse_tuple(Int, x) : log2n + t_end = !isnothing(iarg("t_end")) ? arg_value("t_end") |> x -> parse(Float64, x) : t_end + max_steps = !isnothing(iarg("max_steps")) ? arg_value("max_steps") |> x -> parse(Int, x) : max_steps + dtype = !isnothing(iarg("dtype")) ? arg_value("dtype") |> x -> eval(Symbol(x)) : dtype + backend = !isnothing(iarg("backend")) ? arg_value("backend") |> x -> eval(Symbol(x)) : backend + samples = !isnothing(iarg("sampels")) ? arg_value("samples") |> x -> parse(Int, x) : samples + return log2n, t_end, max_steps, dtype, backend, samples +end + +macro add_benchmark(args...) 
+ ex, b, suite, label = args + return quote + $suite[$label] = @benchmarkable begin + $ex + synchronize($b) + end + end |> esc +end + +backend_str = Dict(Array => "CPUx$(Threads.nthreads())", CuArray => "GPU") + +function add_to_suite!(suite, sim_function; log2n=(3,4,5), t_end=t_end, max_steps=max_steps, dtype=Float32, backend=Array) + bstr = backend_str[backend] + suite[bstr] = BenchmarkGroup([bstr]) + for n in log2n + sim = sim_function(n, backend; T=dtype) + suite[bstr][repr(n)] = BenchmarkGroup([repr(n)]) + @add_benchmark sim_step!($sim, $t_end; max_steps=$max_steps, verbose=true, remeasure=false) $(get_backend(sim.flow.p)) suite[bstr][repr(n)] "sim_step!" + end +end + +git_hash() = read(`git rev-parse --short HEAD`, String) |> x -> strip(x, '\n') + diff --git a/src/WaterLily.jl b/src/WaterLily.jl index 213d9046..f5598782 100644 --- a/src/WaterLily.jl +++ b/src/WaterLily.jl @@ -74,15 +74,15 @@ scales. sim_time(sim::Simulation) = time(sim)*sim.U/sim.L """ - sim_step!(sim::Simulation,t_end;remeasure=true,verbose=false) + sim_step!(sim::Simulation,t_end;max_steps=typemax(Int),remeasure=true,verbose=false) Integrate the simulation `sim` up to dimensionless time `t_end`. -If `remeasure=true`, the body is remeasured at every time step. +If `remeasure=true`, the body is remeasured at every time step. Can be set to `false` for static geometries to speed up simulation. """ -function sim_step!(sim::Simulation,t_end;verbose=false,remeasure=true) +function sim_step!(sim::Simulation,t_end;max_steps=typemax(Int),verbose=false,remeasure=true) t = time(sim) - while t < t_end*sim.L/sim.U + while t < t_end*sim.L/sim.U && length(sim.flow.Δt) <= max_steps remeasure && measure!(sim,t) mom_step!(sim.flow,sim.pois) # evolve Flow t += sim.flow.Δt[end] From 564937bbe28f8b3459db0de182b165b926b8f1aa Mon Sep 17 00:00:00 2001 From: Bernat Font Date: Sat, 16 Dec 2023 13:02:05 +0100 Subject: [PATCH 2/9] Workflow running as intended, tested for multiple Julia versions, case configuration, backends. Reverted the dependencies added in WaterLily, and these are now included in the benchmarks environment. To run the benchmarks, the benchmarks environment is activated, while WaterLily is marked as a development package. 
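For reference, the environment setup this message describes can be reproduced by hand. A minimal sketch, assuming the commands are run from the `benchmark/` directory (the script itself resolves the WaterLily path programmatically):

```
# Develop the parent WaterLily checkout into the local benchmark environment,
# then update and precompile its dependencies
julia --project -e 'using Pkg; Pkg.develop(path=".."); Pkg.update(); Pkg.precompile()'
# Run a case against that environment (omitted flags fall back to parse_cla defaults)
julia --project -t 6 tgv.jl --backend=Array
```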
--- Project.toml | 3 -- benchmark/Project.toml | 7 +++ benchmark/launch_bernchmarks.sh | 94 ++++++++++++++++++++++----------- 3 files changed, 71 insertions(+), 33 deletions(-) create mode 100644 benchmark/Project.toml diff --git a/Project.toml b/Project.toml index 6d754337..1eecafcf 100644 --- a/Project.toml +++ b/Project.toml @@ -5,14 +5,11 @@ version = "1.0.3" [deps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -OutMacro = "0ae4d431-9932-4135-a8f1-51ee5e017775" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/benchmark/Project.toml b/benchmark/Project.toml new file mode 100644 index 00000000..85cbe05d --- /dev/null +++ b/benchmark/Project.toml @@ -0,0 +1,7 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +OutMacro = "0ae4d431-9932-4135-a8f1-51ee5e017775" +WaterLily = "ed894a53-35f9-47f1-b17f-85db9237eebd" diff --git a/benchmark/launch_bernchmarks.sh b/benchmark/launch_bernchmarks.sh index 3dd85793..d91f4467 100644 --- a/benchmark/launch_bernchmarks.sh +++ b/benchmark/launch_bernchmarks.sh @@ -1,36 +1,55 @@ #!/bin/bash +# Usage example +# sh launch_bernchmarks.sh -v "1.8.5 1.10.0-rc1" --threads 6 --backends "Array CuArray" --cases "tgv.jl" --log2n "(3,4)" + +# Grep current julia version +julia_version () { + julia_v=($(julia -v)) + echo "${julia_v[2]}" +} # Update project environment with new Julia version update_environment () { - echo "Updating environment to Julia v$version" - # juliaup default $version - julia --project -e "using Pkg; Pkg.update(); Pkg.precompile()" + echo "Updating environment to Julia v$version" + juliaup default $version + # Mark WaterLily as a development package. Then update dependencies and precompile. + julia --project -e "using Pkg; Pkg.develop(PackageSpec(path=join(split(pwd(), '/')[1:end-1], '/'))); Pkg.update();" +} + +run_benchmark () { + echo "Running: julia --projects $args" + julia --project $args } +# Print benchamrks info display_info () { echo "--------------------------------------" echo "Running benchmark tests for: - - Julia: ${VERSIONS[@]} - - Backends: ${BACKENDS[@]}" + - Julia: ${VERSIONS[@]} + - Backends: ${BACKENDS[@]}" if [[ " ${BACKENDS[*]} " =~ [[:space:]]'Array'[[:space:]] ]]; then - echo " - CPU threads: ${THREADS[@]}" + echo " - CPU threads: ${THREADS[@]}" fi - echo " - Cases: ${CASES[@]} - - Size: ${LOG2N[@]} - - Sim. time: ${TEND[@]} - - Max. steps: ${MAXSTEPS[@]}" + echo " - Cases: ${CASES[@]} + - Size: ${LOG2N[@]} + - Sim. time: ${TEND[@]} + - Max. steps: ${MAXSTEPS[@]} + - Data type: ${DTYPE[@]} + - Num. samples: ${SAMPLES[@]}" echo "--------------------------------------"; echo } -# Defaults -# VERSIONS=('1.8.5' '1.10.0-rc2') -VERSIONS=('1.8.5') +# Default backends +VERSIONS=($(julia_version)) BACKENDS=('Array' 'CuArray') THREADS=('1' '6') +# Default cases. 
Arrays below must be same length (specify each case individually) CASES=('tgv.jl' 'donut.jl') -LOG2N=('(4,5,6)' '(7,8,9)') +LOG2N=('(5,6,7)' '(5,6,7)') TEND=('10.0' '10.0') MAXSTEPS=('100' '100') +DTYPE=('Float32' 'Float32') +SAMPLES=('1' '1') # Parse arguments while [ $# -gt 0 ]; do @@ -56,13 +75,21 @@ case "$1" in shift ;; --t_end|-tend) - MAXSTEPS=($2) + TEND=($2) shift ;; - --max_steps|-ms) + --max_steps|-maxsteps) MAXSTEPS=($2) shift ;; + --data_type|-dtype) + DTYPE=($2) + shift + ;; + --samples|-s) + SAMPLES=($2) + shift + ;; *) printf "ERROR: Invalid argument\n" exit 1 @@ -84,19 +111,26 @@ display_info # Benchmarks for version in "${VERSIONS[@]}" ; do echo "Julia v$version benchmaks" - for backend in "${BACKENDS[@]}" ; do - if [ "${backend}" == "Array" ]; then - for thread in "${THREADS[@]}" ; do - args="-t $thread " - done - else - echo "Backend is not Array" - fi - done - # update_environment - for case in "${CASES[@]}" ; do - for log2n in "${LOG2N[@]}" ; do - + update_environment + for i in "${!CASES[@]}"; do + args_case="${CASES[$i]} --log2n=${LOG2N[$i]} --t_end=${TEND[$i]} --max_steps=${MAXSTEPS[$i]} --dtype=${DTYPE[$i]} --samples=${SAMPLES[$i]}" + for backend in "${BACKENDS[@]}" ; do + if [ "${backend}" == "Array" ]; then + for thread in "${THREADS[@]}" ; do + args="-t $thread "$args_case" --backend=$backend" + run_benchmark + done + else + args=$args_case" --backend=$backend" + run_benchmark + fi done done -done \ No newline at end of file +done + +# Run comparison [ToDo] + + +# Restore julia system version to default one and exit +juliaup default $(julia_version) +exit 0 \ No newline at end of file From 93b9dd6e6a8918edd352c8abba19c8f608eed972 Mon Sep 17 00:00:00 2001 From: Bernat Font Date: Tue, 19 Dec 2023 02:18:19 +0100 Subject: [PATCH 3/9] Finished framework, added documentation in the bash file, and fixed subproject dependencies. --- .gitignore | 1 + benchmark/Project.toml | 3 +- benchmark/benchmark.sh | 162 ++++++++++++++++++++++++++++++++ benchmark/compare.jl | 25 +++++ benchmark/launch_bernchmarks.sh | 136 --------------------------- benchmark/tgv.jl | 22 ++--- benchmark/util.jl | 20 ++-- 7 files changed, 207 insertions(+), 162 deletions(-) create mode 100644 benchmark/benchmark.sh create mode 100644 benchmark/compare.jl delete mode 100644 benchmark/launch_bernchmarks.sh diff --git a/.gitignore b/.gitignore index a498f28e..98c5584a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.gif *.mp4 *.dat +*.json *.pdf *.vti *.pvd diff --git a/benchmark/Project.toml b/benchmark/Project.toml index 85cbe05d..f1f63560 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -1,7 +1,6 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -OutMacro = "0ae4d431-9932-4135-a8f1-51ee5e017775" +PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" WaterLily = "ed894a53-35f9-47f1-b17f-85db9237eebd" diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh new file mode 100644 index 00000000..74505dfe --- /dev/null +++ b/benchmark/benchmark.sh @@ -0,0 +1,162 @@ +#!/bin/bash +# ---- Automatic benchmark generation script +# Allows to generate benchmark across different julia versions, backends, cases, and cases sizes. 
+# juliaup is required: https://github.com/JuliaLang/juliaup +# +# Accepted arguments are (parenthesis for short version): +# - Backend arguments: --version(-v), --backends(-b) --threads(-t) [Julia version, backend types, number of threads (for Array backend)] +# These arguments accept a list of different parameters, for example: +# -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 6" +# which would generate benchmark for all these combinations of parameters. +# - Case arguments: --cases(-c), --log2p(-p), --max_steps(-s), --ftype(-ft) [Benchmark case file, case sizes, number of time steps, float data type] +# The following arguments would generate benchmarks for the "tgv.jl" case: +# -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32" +# which in addition to the benchmark arguments, altogether can be used to launch this script as: +# sh benchmark.sh -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 3 6" -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32" +# Case arguments accept a list of parameters for each case, and the list index is shared across these arguments (hence lists must have equal length): +# -c "tgv.jl donut.jl" -p "5,6,7 7,8" -s "100 500" -ft "Float32 Float64" +# which would run the same benchmarks for the TGV as before, and benchmarks for the donut case too resulting into +# 2 Julia versions x (2 Array + 1 CuArray) backends x (3 TGV sizes + 2 donut sizes) = 15 benchmarks +# +# Benchmarks are saved in JSON format with the following nomenclature: +# casename_sizes_maxsteps_ftype_backend_waterlilyHEADhash_juliaversion.json +# Benchmarks can be finally compared using compare.jl as follows +# julia --project compare.jl benchmark_1.json benchmark_2.json benchmark_3.json ... +# Note that each case benchmarks should be compared separately. +# If a single case is benchmarked, and all the JSON files in the current directory belong to it, one can simply run: +# julia --project compare.jl $(find . -name "*.json" -printf "%T@ %Tc %p\n" | sort -n | awk '{print $8}') +# which would take all the JSON files, sort them by creation time, and pass them as arguments to the compare.jl program. +# Finally, note that the first benchmark passed as argument is taken as reference to compute speedups of other benchmarks: +# speedup_x = time(benchmark_1) / time(benchmark_x). +# +# TL;DR: Usage example +# sh benchmark.sh -v "1.9.4 1.10.0-rc1" -t "1 3 6" -b "Array CuArray" -c "tgv.jl" -p "5,6,7" +# The default launch is equivalent to: +# sh benchmark.sh -v JULIA_DEFAULT -t "1 6" -b "Array CuArray" -c "tgv.jl" -p "5,6,7" -s 100 -ft Float32 +# ---- + + +# Grep current julia version +julia_version () { + julia_v=($(julia -v)) + echo "${julia_v[2]}" +} + +# Update project environment with new Julia version +update_environment () { + echo "Updating environment to Julia v$version" + juliaup default $version + # Mark WaterLily as a development package. Then update dependencies and precompile. + julia --project -e "using Pkg; Pkg.develop(PackageSpec(path=join(split(pwd(), '/')[1:end-1], '/'))); Pkg.update();" +} + +run_benchmark () { + echo "Running: julia --project $args" + julia --project $args +} + +# Print benchamrks info +display_info () { + echo "--------------------------------------" + echo "Running benchmark tests for: + - Julia: ${VERSIONS[@]} + - Backends: ${BACKENDS[@]}" + if [[ " ${BACKENDS[*]} " =~ [[:space:]]'Array'[[:space:]] ]]; then + echo " - CPU threads: ${THREADS[@]}" + fi + echo " - Cases: ${CASES[@]} + - Size: ${LOG2P[@]:0:$NCASES} + - Sim. 
steps: ${MAXSTEPS[@]:0:$NCASES}
+    - Data type: ${FTYPE[@]:0:$NCASES}"
+    echo "--------------------------------------"; echo
+}
+
+# Default backends
+DEFAULT_JULIA_VERSION=$(julia_version)
+VERSIONS=($DEFAULT_JULIA_VERSION)
+BACKENDS=('Array' 'CuArray')
+THREADS=('1' '6')
+# Default cases. Arrays below must be same length (specify each case individually)
+CASES=('tgv.jl')
+LOG2P=('5,6,7')
+MAXSTEPS=('100')
+FTYPE=('Float32')
+
+# Parse arguments
+while [ $# -gt 0 ]; do
+case "$1" in
+    --versions|-v)
+    VERSIONS=($2)
+    shift
+    ;;
+    --backends|-b)
+    BACKENDS=($2)
+    shift
+    ;;
+    --threads|-t)
+    THREADS=($2)
+    shift
+    ;;
+    --cases|-c)
+    CASES=($2)
+    shift
+    ;;
+    --log2p|-p)
+    LOG2P=($2)
+    shift
+    ;;
+    --max_steps|-s)
+    MAXSTEPS=($2)
+    shift
+    ;;
+    --float_type|-ft)
+    FTYPE=($2)
+    shift
+    ;;
+    *)
+    printf "ERROR: Invalid argument\n"
+    exit 1
+esac
+shift
+done
+
+NCASES=${#CASES[@]}
+
+# Assert "--threads" argument is not empty if "Array" backend is present
+if [[ " ${BACKENDS[*]} " =~ [[:space:]]'Array'[[:space:]] ]]; then
+    if [ "${#THREADS[@]}" == 0 ]; then
+        echo "ERROR: Backend 'Array' is present, but '--threads' argument is empty."
+        exit 1
+    fi
+fi
+
+# Display information
+display_info
+
+# Benchmarks
+for version in "${VERSIONS[@]}" ; do
+    echo "Julia v$version benchmarks"
+    update_environment
+    for i in "${!CASES[@]}"; do
+        args_case="${CASES[$i]} --log2p="${LOG2P[$i]}" --max_steps=${MAXSTEPS[$i]} --ftype=${FTYPE[$i]}"
+        for backend in "${BACKENDS[@]}" ; do
+            if [ "${backend}" == "Array" ]; then
+                for thread in "${THREADS[@]}" ; do
+                    args="-t $thread "$args_case" --backend=$backend"
+                    run_benchmark
+                done
+            else
+                args=$args_case" --backend=$backend"
+                run_benchmark
+            fi
+        done
+    done
+done
+
+# To compare all the benchmarks in this directory, run
+# julia --project compare.jl $(find . -name "*.json" -printf "%T@ %Tc %p\n" | sort -n | awk '{print $8}')
+
+# Restore julia system version to default one and exit
+juliaup default $DEFAULT_JULIA_VERSION
+echo "All done!"
+exit 0
\ No newline at end of file
diff --git a/benchmark/compare.jl b/benchmark/compare.jl
new file mode 100644
index 00000000..cd6af605
--- /dev/null
+++ b/benchmark/compare.jl
@@ -0,0 +1,25 @@
+using BenchmarkTools, PrettyTables
+
+# Load benchmarks
+benchmarks = [BenchmarkTools.load(f)[1] for f in ARGS]
+# Get backends string vector and assert same case sizes for the different backends
+backends_str = [String.(k)[1] for k in keys.(benchmarks)]
+log2p_str = [String.(keys(benchmarks[i][backend_str])) for (i, backend_str) in enumerate(backends_str)]
+@assert length(unique(log2p_str)) == 1
+# Assuming the case and tested function are the same in all benchmarks, we grab their names
+case, f_test = benchmarks[1].tags[1:2]
+# Get data for PrettyTables
+header = ["Backend", "WaterLily", "Julia", "Precision", "Allocations", "GC [%]", "Time [s]", "Speed-up"]
+data, base_speedup = Matrix{Any}(undef, length(benchmarks), length(header)), 1.0
+printstyled("Benchmark environment: $case $f_test (max_steps=$(benchmarks[1].tags[4]))\n", bold=true)
+for n in log2p_str[1]
+    printstyled("▶ log2p = $n\n", bold=true)
+    for (i, benchmark) in enumerate(benchmarks)
+        datap = benchmark[backends_str[i]][n][f_test]
+        speedup = i == 1 ?
1.0 : benchmarks[1][backends_str[1]][n][f_test].times[1] / datap.times[1] + data[i, :] .= [backends_str[i], benchmark.tags[end-1], benchmark.tags[end], benchmark.tags[end-3], + datap.allocs, (datap.gctimes[1] / datap.times[1]) * 100.0, datap.times[1] / 1e9, speedup] + end + pretty_table(data; header=header, header_alignment=:c, formatters=ft_printf("%.2f", [6,7,8])) +end + diff --git a/benchmark/launch_bernchmarks.sh b/benchmark/launch_bernchmarks.sh deleted file mode 100644 index d91f4467..00000000 --- a/benchmark/launch_bernchmarks.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/bin/bash -# Usage example -# sh launch_bernchmarks.sh -v "1.8.5 1.10.0-rc1" --threads 6 --backends "Array CuArray" --cases "tgv.jl" --log2n "(3,4)" - -# Grep current julia version -julia_version () { - julia_v=($(julia -v)) - echo "${julia_v[2]}" -} - -# Update project environment with new Julia version -update_environment () { - echo "Updating environment to Julia v$version" - juliaup default $version - # Mark WaterLily as a development package. Then update dependencies and precompile. - julia --project -e "using Pkg; Pkg.develop(PackageSpec(path=join(split(pwd(), '/')[1:end-1], '/'))); Pkg.update();" -} - -run_benchmark () { - echo "Running: julia --projects $args" - julia --project $args -} - -# Print benchamrks info -display_info () { - echo "--------------------------------------" - echo "Running benchmark tests for: - - Julia: ${VERSIONS[@]} - - Backends: ${BACKENDS[@]}" - if [[ " ${BACKENDS[*]} " =~ [[:space:]]'Array'[[:space:]] ]]; then - echo " - CPU threads: ${THREADS[@]}" - fi - echo " - Cases: ${CASES[@]} - - Size: ${LOG2N[@]} - - Sim. time: ${TEND[@]} - - Max. steps: ${MAXSTEPS[@]} - - Data type: ${DTYPE[@]} - - Num. samples: ${SAMPLES[@]}" - echo "--------------------------------------"; echo -} - -# Default backends -VERSIONS=($(julia_version)) -BACKENDS=('Array' 'CuArray') -THREADS=('1' '6') -# Default cases. Arrays below must be same length (specify each case individually) -CASES=('tgv.jl' 'donut.jl') -LOG2N=('(5,6,7)' '(5,6,7)') -TEND=('10.0' '10.0') -MAXSTEPS=('100' '100') -DTYPE=('Float32' 'Float32') -SAMPLES=('1' '1') - -# Parse arguments -while [ $# -gt 0 ]; do -case "$1" in - --versions|-v) - VERSIONS=($2) - shift - ;; - --backends|-b) - BACKENDS=($2) - shift - ;; - --threads|-t) - THREADS=($2) - shift - ;; - --cases|-c) - CASES=($2) - shift - ;; - --log2n|-log2n) - LOG2N=($2) - shift - ;; - --t_end|-tend) - TEND=($2) - shift - ;; - --max_steps|-maxsteps) - MAXSTEPS=($2) - shift - ;; - --data_type|-dtype) - DTYPE=($2) - shift - ;; - --samples|-s) - SAMPLES=($2) - shift - ;; - *) - printf "ERROR: Invalid argument\n" - exit 1 -esac -shift -done - -# Assert "Array" backend is present if "--threads" argument is passed -if [[ " ${BACKENDS[*]} " =~ [[:space:]]'Array'[[:space:]] ]]; then - if [ "${#THREADS[@]}" == 0 ]; then - echo "ERROR: Backend 'Array' present, '--threads' argument is empty." 
- exit 1 - fi -fi - -# Display information -display_info - -# Benchmarks -for version in "${VERSIONS[@]}" ; do - echo "Julia v$version benchmaks" - update_environment - for i in "${!CASES[@]}"; do - args_case="${CASES[$i]} --log2n=${LOG2N[$i]} --t_end=${TEND[$i]} --max_steps=${MAXSTEPS[$i]} --dtype=${DTYPE[$i]} --samples=${SAMPLES[$i]}" - for backend in "${BACKENDS[@]}" ; do - if [ "${backend}" == "Array" ]; then - for thread in "${THREADS[@]}" ; do - args="-t $thread "$args_case" --backend=$backend" - run_benchmark - done - else - args=$args_case" --backend=$backend" - run_benchmark - fi - done - done -done - -# Run comparison [ToDo] - - -# Restore julia system version to default one and exit -juliaup default $(julia_version) -exit 0 \ No newline at end of file diff --git a/benchmark/tgv.jl b/benchmark/tgv.jl index 2a867c9e..fbac7a5f 100644 --- a/benchmark/tgv.jl +++ b/benchmark/tgv.jl @@ -2,15 +2,10 @@ using WaterLily using BenchmarkTools using CUDA: CuArray using KernelAbstractions: synchronize, get_backend -using JLD2 -using OutMacro include("util.jl") -log2n, t_end, max_steps, dtype, backend, samples = parse_cla(ARGS; - log2n=(5,6,7,8), t_end=1.0, max_steps=10, dtype=Float32, backend=Array, samples=5) -evals = 5 -verbose = true +log2p, max_steps, ftype, backend = parse_cla(ARGS; log2p=(5,6,7,8), max_steps=100, ftype=Float32, backend=Array) function TGV(p, backend; Re=1e5, T=Float32) # Define vortex size, velocity, viscosity @@ -27,13 +22,14 @@ function TGV(p, backend; Re=1e5, T=Float32) end function benchmark() - suite, results = BenchmarkGroup(), BenchmarkGroup() - sim_step!(TGV(log2n[1], backend; T=dtype), t_end; max_steps=1, verbose=true, remeasure=false) # warm up - add_to_suite!(suite, TGV; log2n=log2n, t_end=t_end, max_steps=max_steps, dtype=dtype, backend=backend) # create benchmark - # tune!(suite) - results[backend_str[backend]] = run(suite[backend_str[backend]], samples=samples, evals=evals, seconds=1e6, verbose=verbose) # run! - fname = string(@__DIR__)*"/tgv_simstep_p$(log2n...)_$(backend_str[backend])_v$VERSION.dat" - save_object(fname, results) # save benchmark + suite = BenchmarkGroup() + results = BenchmarkGroup(["TGV", "sim_step!", log2p, max_steps, ftype, backend_str[backend], git_hash, string(VERSION)]) + sim_step!(TGV(log2p[1], backend; T=ftype), typemax(ftype); max_steps=1, verbose=false, remeasure=false) # warm up + add_to_suite!(suite, TGV; log2p=log2p, max_steps=max_steps, ftype=ftype, backend=backend) # create benchmark + results[backend_str[backend]] = run(suite[backend_str[backend]], samples=1, evals=1, seconds=1e6, verbose=true) # run! + fname = string(@__DIR__) * "/" * split(PROGRAM_FILE, '.')[1] * + "_$(log2p...)_$(max_steps)_$(ftype)_$(backend_str[backend])_$(git_hash)_$VERSION.json" + BenchmarkTools.save(fname, results) end benchmark() \ No newline at end of file diff --git a/benchmark/util.jl b/benchmark/util.jl index bcb8fa5c..21d37c5c 100644 --- a/benchmark/util.jl +++ b/benchmark/util.jl @@ -1,15 +1,13 @@ -function parse_cla(args; log2n=(2,3,4), t_end=1.0, max_steps=10, dtype=Float32, backend=Array, samples=1) +function parse_cla(args; log2p=(2,3,4), max_steps=10, ftype=Float32, backend=Array) iarg(arg) = occursin.(arg, args) |> findfirst parse_tuple(T, s) = Tuple(parse.(T, split(strip(s, ['(', ')', ' ']), ','; keepempty=false))) arg_value(arg) = split(args[iarg(arg)], "=")[end] - log2n = !isnothing(iarg("log2n")) ? arg_value("log2n") |> x -> parse_tuple(Int, x) : log2n - t_end = !isnothing(iarg("t_end")) ? 
arg_value("t_end") |> x -> parse(Float64, x) : t_end + log2p = !isnothing(iarg("log2p")) ? arg_value("log2p") |> x -> parse_tuple(Int, x) : log2p max_steps = !isnothing(iarg("max_steps")) ? arg_value("max_steps") |> x -> parse(Int, x) : max_steps - dtype = !isnothing(iarg("dtype")) ? arg_value("dtype") |> x -> eval(Symbol(x)) : dtype + ftype = !isnothing(iarg("ftype")) ? arg_value("ftype") |> x -> eval(Symbol(x)) : ftype backend = !isnothing(iarg("backend")) ? arg_value("backend") |> x -> eval(Symbol(x)) : backend - samples = !isnothing(iarg("sampels")) ? arg_value("samples") |> x -> parse(Int, x) : samples - return log2n, t_end, max_steps, dtype, backend, samples + return log2p, max_steps, ftype, backend end macro add_benchmark(args...) @@ -24,15 +22,15 @@ end backend_str = Dict(Array => "CPUx$(Threads.nthreads())", CuArray => "GPU") -function add_to_suite!(suite, sim_function; log2n=(3,4,5), t_end=t_end, max_steps=max_steps, dtype=Float32, backend=Array) +function add_to_suite!(suite, sim_function; log2p=(3,4,5), max_steps=max_steps, ftype=Float32, backend=Array) bstr = backend_str[backend] suite[bstr] = BenchmarkGroup([bstr]) - for n in log2n - sim = sim_function(n, backend; T=dtype) + for n in log2p + sim = sim_function(n, backend; T=ftype) suite[bstr][repr(n)] = BenchmarkGroup([repr(n)]) - @add_benchmark sim_step!($sim, $t_end; max_steps=$max_steps, verbose=true, remeasure=false) $(get_backend(sim.flow.p)) suite[bstr][repr(n)] "sim_step!" + @add_benchmark sim_step!($sim, $typemax(ftype); max_steps=$max_steps, verbose=false, remeasure=false) $(get_backend(sim.flow.p)) suite[bstr][repr(n)] "sim_step!" end end -git_hash() = read(`git rev-parse --short HEAD`, String) |> x -> strip(x, '\n') +git_hash = read(`git rev-parse --short HEAD`, String) |> x -> strip(x, '\n') From ddb2e85f1b896e447773521eadc98e71289ac860 Mon Sep 17 00:00:00 2001 From: Bernat Font Date: Tue, 19 Dec 2023 02:19:06 +0100 Subject: [PATCH 4/9] Deleted old TGV benchmarks. --- benchmark/tgv/tgv.jl | 107 ------------------------------------ benchmark/tgv/tgv_serial.jl | 38 ------------- 2 files changed, 145 deletions(-) delete mode 100644 benchmark/tgv/tgv.jl delete mode 100644 benchmark/tgv/tgv_serial.jl diff --git a/benchmark/tgv/tgv.jl b/benchmark/tgv/tgv.jl deleted file mode 100644 index b4087d7c..00000000 --- a/benchmark/tgv/tgv.jl +++ /dev/null @@ -1,107 +0,0 @@ -using WaterLily -using LinearAlgebra: norm2 -using BenchmarkTools -using CUDA: CuArray -using KernelAbstractions: synchronize, get_backend -using JLD2 - -macro add_benchmark(args...) - ex, b, suite, label = args - return quote - $suite[$label] = @benchmarkable begin - $ex - synchronize($b) - end - end |> esc -end - -function TGV(p, backend; Re=1e5, T=Float32) - # Define vortex size, velocity, viscosity - L = 2^p; U = 1; ν = U*L/Re - # Taylor-Green-Vortex initial velocity field - function uλ(i,vx) - x,y,z = @. (vx-1.5)*π/L # scaled coordinates - i==1 && return -U*sin(x)*cos(y)*cos(z) # u_x - i==2 && return U*cos(x)*sin(y)*cos(z) # u_y - return 0. # u_z - end - # Initialize simulation - return Simulation((L, L, L), (0, 0, 0), L; U=U, uλ=uλ, ν=ν, T=T, mem=backend) -end - -function create_suite() - suite = BenchmarkGroup() - for (ArrayT, b) ∈ zip([Array, CuArray], backends_str) - suite[b] = BenchmarkGroup([b]) - for n ∈ log2N - sim = TGV(n, ArrayT; T=T) - backend = get_backend(sim.flow.p) - suite[b][repr(n)] = BenchmarkGroup([repr(n)]) - @add_benchmark sim_step!($sim, $t_sim_CTU; verbose=true, remeasure=false) $backend suite[b][repr(n)] "sim_step!" 
- end - end - return suite -end - -log2N, t_sim_CTU, T = (5, 6, 7, 8), 0.1, Float32 - -backends_str = ["CPU", "GPU"] -r = BenchmarkGroup() -samples = 1 # We can't only use >1 samples since flow reaches flow.time on the first one and does not iterate further. -evals = 1 -verbose = true -save_benchmark = false -run_benchmarks = false - -# Run or load benchmarks -if run_benchmarks - # Force first run to compile - simCPU = TGV(4, Array; T=T) - sim_step!(simCPU, t_sim_CTU; verbose=true, remeasure=false) - simGPU = TGV(4, CuArray; T=T) - sim_step!(simGPU, t_sim_CTU; verbose=true, remeasure=false) - # Create benchmark suite - suite = create_suite() - r["CPU"] = run(suite["CPU"], samples = samples, evals = evals, seconds = 1e6, verbose = verbose) - r["GPU"] = run(suite["GPU"], samples = samples, evals = evals, seconds = 1e6, verbose = verbose) - # save_benchmark && save_object("benchmark/tgv/sim_step_5678_update_mult_1.9.2.dat", r) - save_benchmark && save_object("benchmark/tgv/sim_step_5678_master_1.9.2.dat", r) -else - # r = load_object("benchmark/tgv/sim_step_5678_update_mult_1.9.2.dat") - r = load_object("benchmark/tgv/sim_step_5678_master_1.9.2.dat") -end -# Serial (master) benchmarks -# r["serial"] = load_object("benchmark/tgv/sim_step_5678_serial_1.8_old.dat") -r["serial"] = load_object("benchmark/tgv/sim_step_5678_serial_1.8.5.dat") - -# Postprocess results -push!(backends_str, "serial") -btimes = Dict((b, Dict((n, 0.0) for n ∈ repr.(log2N))) for b ∈ backends_str) -for b ∈ backends_str, n ∈ repr.(log2N) - btimes[b][n]= r[b][n]["sim_step!"] |> time # only single sample - btimes[b][n]/= 10^9 # times now in ms -end -btimes_sim_step = (serial = T[btimes["serial"][n] for n ∈ repr.(log2N)], - CPU = T[btimes["CPU"][n] for n ∈ repr.(log2N)], - GPU = T[btimes["GPU"][n] for n ∈ repr.(log2N)]) - -# speedups -using Printf -println("\nSpeedups:\n n | routine | CPU | GPU\n----------------------------------") -for n ∈ repr.(log2N) - @printf("n=%s | %10s | %06.2f | %06.2f\n", - n, "sim_step!", btimes["serial"][n]/btimes["CPU"][n], btimes["serial"][n]/btimes["GPU"][n]) -end - -# Plots -# using Plots, LaTeXStrings -# p1 = plot(size=(600,600), xlabel=L"\log_2(N)", ylabel="TGV sim_step! "* L"[s]", -# yscale=:log10, legend=:bottomright, foreground_color_legend=nothing, legendfontsize=12, -# yticks=[10.0^n for n in -1:2], markerstrokewidth=0) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_sim_step[:serial], label="serial", marker=4, color=:red, markerstrokewidth=0.25) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_sim_step[:CPU], label="CPU", marker=4, color=:blue, markerstrokewidth=0.25) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_sim_step[:GPU], label="GPU", marker=4, color=:green, markerstrokewidth=0.25) - -# Plots.scalefontsizes(1.5) -# savefig("benchmark/tgv/tgv_benchmark.pdf"); -# Plots.scalefontsizes() \ No newline at end of file diff --git a/benchmark/tgv/tgv_serial.jl b/benchmark/tgv/tgv_serial.jl deleted file mode 100644 index d6eadcb2..00000000 --- a/benchmark/tgv/tgv_serial.jl +++ /dev/null @@ -1,38 +0,0 @@ -using WaterLily -using BenchmarkTools -using JLD2 - -function TGV(p; Re=1e5, T=Float32) - # Define vortex size, velocity, viscosity - L = 2^p; U = 1; ν = U*L/Re - # Taylor-Green-Vortex initial velocity field - function uλ(i,vx) - x,y,z = @. (vx-1.5)*π/L # scaled coordinates - i==1 && return -U*sin(x)*cos(y)*cos(z) # u_x - i==2 && return U*cos(x)*sin(y)*cos(z) # u_y - return 0. 
# u_z - end - # Initialize simulation - return Simulation((L+2,L+2,L+2),zeros(3),L;U,uλ,ν,T) -end - -log2N, t_sim_CTU, T = (5, 6, 7, 8), 0.1, Float32 -# log2N, t_sim_CTU, T = (5,), 0.1, Float32 - -# Force first run to compile -sim_temp = TGV(5; T=T) -sim_step!(sim_temp, t_sim_CTU; verbose=true, remeasure=false) - -suite = BenchmarkGroup() -for n ∈ log2N - suite[repr(n)] = BenchmarkGroup([repr(n)]) - sim = TGV(n; T=T) - suite[repr(n)]["sim_step!"] = @benchmarkable sim_step!($sim, $t_sim_CTU; verbose=true, remeasure=false) -end - -# Run benchmarks -samples = 1 # We can only use 1 sample since more than once used that last flow.time and does not iterate further. -evals = 1 # better to use evaulations instead -verbose = true -r = run(suite, samples = samples, evals = evals, seconds = 1e6, verbose = verbose) -save_object("benchmark/tgv/sim_step_5678_serial.dat", r) \ No newline at end of file From 16f286e82c3de47c7b885f0a9bfdae2422a9e57b Mon Sep 17 00:00:00 2001 From: Bernat Font Date: Tue, 19 Dec 2023 02:31:08 +0100 Subject: [PATCH 5/9] Removed old mom_step and donut directories. To be done again for the new framework. --- benchmark/donut/donut.jl | 107 ----------------------- benchmark/donut/donut_serial.jl | 52 ------------ benchmark/mom_step/Project.toml | 4 - benchmark/mom_step/mom_step.jl | 118 -------------------------- benchmark/mom_step/mom_step_serial.jl | 39 --------- 5 files changed, 320 deletions(-) delete mode 100644 benchmark/donut/donut.jl delete mode 100644 benchmark/donut/donut_serial.jl delete mode 100644 benchmark/mom_step/Project.toml delete mode 100644 benchmark/mom_step/mom_step.jl delete mode 100644 benchmark/mom_step/mom_step_serial.jl diff --git a/benchmark/donut/donut.jl b/benchmark/donut/donut.jl deleted file mode 100644 index 72614ad7..00000000 --- a/benchmark/donut/donut.jl +++ /dev/null @@ -1,107 +0,0 @@ -using WaterLily -using LinearAlgebra: norm2 -using BenchmarkTools -using CUDA: CuArray -using KernelAbstractions: synchronize, get_backend -using JLD2 - -macro add_benchmark(args...) - ex, b, suite, label = args - return quote - $suite[$label] = @benchmarkable begin - $ex - synchronize($b) - end - end |> esc -end - -function donut(p, backend; Re=1e3, T=Float32) - # Define simulation size, geometry dimensions, viscosity - n = 2^p - center,R,r = (n/2,n/2,n/2), n/4, n/16 - ν = R/Re - # Apply signed distance function for a torus - body = AutoBody() do xyz, t - x,y,z = xyz .- center - √sum(abs2,(x, √sum(abs2, (y, z)) - R)) - end - # Initialize simulation - Simulation((2n, n, n), (1, 0, 0), R; ν=ν, body=body, T=T, mem=backend) -end - -function create_suite() - suite = BenchmarkGroup() - for (ArrayT, b) ∈ zip([Array, CuArray], backends_str) - suite[b] = BenchmarkGroup([b]) - for n ∈ log2N - sim = donut(n, ArrayT; T=T) - backend = get_backend(sim.flow.p) - suite[b][repr(n)] = BenchmarkGroup([repr(n)]) - @add_benchmark sim_step!($sim, $t_sim_CTU; verbose=true, remeasure=false) $backend suite[b][repr(n)] "sim_step!" - end - end - return suite -end - -log2N, t_sim_CTU, T = (4, 5, 6, 7), 0.1, Float32 - -backends_str = ["CPU", "GPU"] -r = BenchmarkGroup() -samples = 1 # We can't only use >1 samples since flow reaches flow.time on the first one and does not iterate further. 
-evals = 1 -verbose = true -save_benchmark = false -run_benchmarks = false - -# Run or load benchmarks -if run_benchmarks - # Force first run to compile - simCPU = donut(3, Array; T=T) - sim_step!(simCPU, t_sim_CTU; verbose=true, remeasure=false) - simGPU = donut(3, CuArray; T=T) - sim_step!(simGPU, t_sim_CTU; verbose=true, remeasure=false) - # Create benchmark suite - suite = create_suite() - r["CPU"] = run(suite["CPU"], samples = samples, evals = evals, seconds = 1e6, verbose = verbose) - r["GPU"] = run(suite["GPU"], samples = samples, evals = evals, seconds = 1e6, verbose = verbose) - # save_benchmark && save_object("benchmark/donut/sim_step_4567_update_mult_1.9.2.dat", r) - save_benchmark && save_object("benchmark/donut/sim_step_4567_master_1.9.2.dat", r) -else - # r = load_object("benchmark/donut/sim_step_4567_update_mult_1.9.2.dat") - r = load_object("benchmark/donut/sim_step_4567_master_1.9.2.dat") -end -# Serial (master) benchmarks -r["serial"] = load_object("benchmark/donut/sim_step_4567_serial_1.8.5.dat") -# r["serial"] = load_object("benchmark/donut/sim_step_4567_serial_1.8_old.dat") - -# Postprocess results -push!(backends_str, "serial") -btimes = Dict((b, Dict((n, 0.0) for n ∈ repr.(log2N))) for b ∈ backends_str) -for b ∈ backends_str, n ∈ repr.(log2N) - btimes[b][n]= r[b][n]["sim_step!"] |> time # only single sample - btimes[b][n]/= 10^6 # times now in ms -end -btimes_sim_step = (serial = T[btimes["serial"][n] for n ∈ repr.(log2N)], - CPU = T[btimes["CPU"][n] for n ∈ repr.(log2N)], - GPU = T[btimes["GPU"][n] for n ∈ repr.(log2N)]) - -# speedups -using Printf -println("\nSpeedups:\n n | routine | CPU | GPU\n----------------------------------") -for n ∈ repr.(log2N) - @printf("n=%s | %10s | %06.2f | %06.2f\n", - n, "sim_step!", btimes["serial"][n]/btimes["CPU"][n], btimes["serial"][n]/btimes["GPU"][n]) -end - -# Plots -# using Plots, LaTeXStrings -# p1 = plot(size=(600,600), xlabel=L"\log_2(N)", ylabel="Donut sim_step! "* L"[ms]", -# yscale=:log10, legend=:bottomright, foreground_color_legend=nothing, -# yticks=[10.0^n for n in 2:5]) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_sim_step[:serial], label="serial", marker=4, color=:red) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_sim_step[:CPU], label="CPU", marker=4, color=:blue) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_sim_step[:GPU], label="GPU", marker=4, color=:green) - -# Plots.scalefontsizes(1.5) -# savefig("benchmark/donut/donut_benchmark.pdf"); -# Plots.scalefontsizes() \ No newline at end of file diff --git a/benchmark/donut/donut_serial.jl b/benchmark/donut/donut_serial.jl deleted file mode 100644 index 1e33194f..00000000 --- a/benchmark/donut/donut_serial.jl +++ /dev/null @@ -1,52 +0,0 @@ -using WaterLily -using BenchmarkTools -using JLD2 -using LinearAlgebra: norm2 - -function TGV(p; Re=1e5, T=Float32) - # Define vortex size, velocity, viscosity - L = 2^p; U = 1; ν = U*L/Re - # Taylor-Green-Vortex initial velocity field - function uλ(i,vx) - x,y,z = @. (vx-1.5)*π/L # scaled coordinates - i==1 && return -U*sin(x)*cos(y)*cos(z) # u_x - i==2 && return U*cos(x)*sin(y)*cos(z) # u_y - return 0. 
# u_z - end - # Initialize simulation - return Simulation((L+2,L+2,L+2),zeros(3),L;U,uλ,ν,T) -end - -function donut(p; Re=1e3, T=Float32) - # Define simulation size, geometry dimensions, viscosity - n = 2^p - center,R,r = [n/2,n/2,n/2], n/4, n/16 - ν = R/Re - # Apply signed distance function for a torus - body = AutoBody() do xyz,t - x,y,z = xyz - center - norm2([x,norm2([y,z])-R])-r - end - return Simulation((2n+2,n+2,n+2),[1.,0.,0.],R;ν,body,T) -end - -log2N, t_sim_CTU, T = (4, 5, 6, 7), 0.1, Float32 - -# Force first run to compile -sim_temp = donut(4; T=T) -sim_step!(sim_temp, t_sim_CTU; verbose=true, remeasure=false) - -# Create benchmark suite -suite = BenchmarkGroup() -for n ∈ log2N - suite[repr(n)] = BenchmarkGroup([repr(n)]) - sim = donut(n; T=T) - suite[repr(n)]["sim_step!"] = @benchmarkable sim_step!($sim, $t_sim_CTU; verbose=true, remeasure=false) -end - -# Run benchmarks -samples = 1 # We can only use 1 sample since more than once used that last flow.time and does not iterate further. -evals = 1 # better to use evaulations instead -verbose = true -r = run(suite, samples = samples, evals = evals, seconds = 1e6, verbose = verbose) -save_object("benchmark/donut/sim_step_4567_serial.dat", r) \ No newline at end of file diff --git a/benchmark/mom_step/Project.toml b/benchmark/mom_step/Project.toml deleted file mode 100644 index 6edab7ee..00000000 --- a/benchmark/mom_step/Project.toml +++ /dev/null @@ -1,4 +0,0 @@ -[deps] -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" -WaterLily = "ed894a53-35f9-47f1-b17f-85db9237eebd" diff --git a/benchmark/mom_step/mom_step.jl b/benchmark/mom_step/mom_step.jl deleted file mode 100644 index 966826d6..00000000 --- a/benchmark/mom_step/mom_step.jl +++ /dev/null @@ -1,118 +0,0 @@ -using WaterLily -using BenchmarkTools -using CUDA: CuArray -using KernelAbstractions: synchronize, get_backend -using JLD2 - -macro add_benchmark(args...) - ex, b, suite, label = args - return quote - $suite[$label] = @benchmarkable begin - $ex - synchronize($b) - end - end |> esc -end - -function create_suite() - suite = BenchmarkGroup() - for (ArrayT, b) ∈ zip([Array, CuArray], backends_str) - suite[b] = BenchmarkGroup([b]) - for n ∈ log2N - flow = TGV(n, ArrayT; T=T) - pois = MultiLevelPoisson(flow.p, flow.μ₀, flow.σ) - backend = get_backend(flow.p) - suite[b][repr(n)] = BenchmarkGroup([repr(n)]) - @add_benchmark WaterLily.conv_diff!($flow.f, $flow.u⁰, $flow.σ, ν=$flow.ν) $backend suite[b][repr(n)] "conv_diff!" - @add_benchmark WaterLily.BDIM!($flow) $backend suite[b][repr(n)] "BDIM!" - @add_benchmark BC!($flow.u, $flow.U) $backend suite[b][repr(n)] "BC!" - @add_benchmark WaterLily.project!($flow, $pois) $backend suite[b][repr(n)] "project!" - @add_benchmark WaterLily.CFL($flow) $backend suite[b][repr(n)] "CFL" - end - end - return suite -end - -function TGV(p, backend; Re=1e5, T=Float32) - # Define vortex size, velocity, viscosity - L = 2^p; U = 1; ν = U*L/Re - # Taylor-Green-Vortex initial velocity field - function uλ(i,vx) - x,y,z = @. (vx-1.5)*π/L # scaled coordinates - i==1 && return -U*sin(x)*cos(y)*cos(z) # u_x - i==2 && return U*cos(x)*sin(y)*cos(z) # u_y - return 0. 
# u_z - end - # Initialize simulation - return Flow((L, L, L), (0, 0, 0); f=backend, ν=ν, uλ=uλ, T=T) -end - -log2N = (5, 6, 7, 8) -U, T = (0, 0, 0), Float32 - -backends_str = ["CPU", "GPU"] -r = BenchmarkGroup() -samples = 100 # Use >1 since timings reported are min(samples), and the first run always compiles -verbose = true -save_benchmark = false -run_benchmarks = false - -# Run or load benchmarks -if run_benchmarks - suite = create_suite() - r["CPU"] = run(suite["CPU"], samples = samples, seconds = 1e6, verbose = verbose) - r["GPU"] = run(suite["GPU"], samples = samples, seconds = 1e6, verbose = verbose) - # save_benchmark && save_object("benchmark/mom_step/mom_step_5678_master.dat", r) - save_benchmark && save_object("benchmark/mom_step/mom_step_5678_update_mult.dat", r) -else - # r = load_object("benchmark/mom_step/mom_step_5678_master.dat") - r = load_object("benchmark/mom_step/mom_step_5678_update_mult.dat") -end -# Serial (master) benchmarks -r["serial"] = load_object("benchmark/mom_step/mom_step_5678_serial_1.8.dat") - -# Postprocess results -routines = ["conv_diff!", "BDIM!", "BC!", "project!", "CFL"] -push!(backends_str, "serial") -btimes = Dict((b, Dict((n, Dict()) for n ∈ repr.(log2N))) for b ∈ backends_str) -for b ∈ backends_str, n ∈ repr.(log2N), f ∈ routines - btimes[b][n][f] = r[b][n][f][2:end] |> minimum |> time # throw out first sample - btimes[b][n][f] /= 10^6 # times now in ms -end -btimes_conv_diff = (serial = T[btimes["serial"][n]["conv_diff!"] for n ∈ repr.(log2N)], - CPU = T[btimes["CPU"][n]["conv_diff!"] for n ∈ repr.(log2N)], - GPU = T[btimes["GPU"][n]["conv_diff!"] for n ∈ repr.(log2N)]) -btimes_project = (serial = T[btimes["serial"][n]["project!"] for n ∈ repr.(log2N)], - CPU = T[btimes["CPU"][n]["project!"] for n ∈ repr.(log2N)], - GPU = T[btimes["GPU"][n]["project!"] for n ∈ repr.(log2N)]) - -# speedups -using Printf -println("\nSpeedups:\n n | routine | CPU | GPU\n----------------------------------") -for n ∈ repr.(log2N), f ∈ routines - @printf("n=%s | %10s | %06.2f | %06.2f\n", - n, f, btimes["serial"][n][f]/btimes["CPU"][n][f], btimes["serial"][n][f]/btimes["GPU"][n][f]) -end - -# Plots -# using Plots, LaTeXStrings -# p1 = plot(size=(600,600), xlabel=L"\log_2(N)", ylabel="TGV conv_diff! "* L"[ms]", -# yscale=:log10, legend=:bottomright, foreground_color_legend=nothing, legendfontsize=12, -# yticks=[10.0^n for n in 0:3]) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_conv_diff[:serial], label="serial", marker=4, color=:red, markerstrokewidth=0.25) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_conv_diff[:CPU], label="CPU", marker=4, color=:blue, markerstrokewidth=0.25) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_conv_diff[:GPU], label="GPU", marker=4, color=:green, markerstrokewidth=0.25) -# Plots.scalefontsizes(1.5) -# savefig("benchmark/mom_step/benchmark_tgv_conv_diff.pdf"); -# Plots.scalefontsizes() - -# p1 = plot(size=(600,600), xlabel=L"\log_2(N)", ylabel="TGV project! 
"* L"[ms]", -# yscale=:log10, legend=:bottomright, foreground_color_legend=nothing, legendfontsize=12, -# yticks=[10.0^n for n in 0:3]) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_project[:serial], label="serial", marker=4, color=:red, markerstrokewidth=0.25) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_project[:CPU], label="CPU", marker=4, color=:blue, markerstrokewidth=0.25) -# plot!([n for n ∈ repr.(log2N.*3)], btimes_project[:GPU], label="GPU", marker=4, color=:green, markerstrokewidth=0.25) -# Plots.scalefontsizes(1.5) -# savefig("benchmark/mom_step/benchmark_tgv_project.pdf"); -# Plots.scalefontsizes() - diff --git a/benchmark/mom_step/mom_step_serial.jl b/benchmark/mom_step/mom_step_serial.jl deleted file mode 100644 index c5fbffa9..00000000 --- a/benchmark/mom_step/mom_step_serial.jl +++ /dev/null @@ -1,39 +0,0 @@ -using WaterLily -using BenchmarkTools -using JLD2 - -function TGV(p; Re=1e5, T=Float32) - # Define vortex size, velocity, viscosity - L = 2^p; U = 1; ν = U*L/Re - # Taylor-Green-Vortex initial velocity field - function uλ(i,vx) - x,y,z = @. (vx-1.5)*π/L # scaled coordinates - i==1 && return -U*sin(x)*cos(y)*cos(z) # u_x - i==2 && return U*cos(x)*sin(y)*cos(z) # u_y - return 0. # u_z - end - # Initialize simulation - return Flow((L+2,L+2,L+2),zeros(3); ν, uλ, T) -end - -log2N = (5, 6, 7, 8) -T = Float32 -U = T[0.0, 0.0, 0.0] - -suite = BenchmarkGroup() -for n ∈ log2N - flow = TGV(n; T=T) - pois = MultiLevelPoisson(flow.μ₀) - suite[repr(n)] = BenchmarkGroup([repr(n)]) - suite[repr(n)]["conv_diff!"] = @benchmarkable WaterLily.conv_diff!($flow.f, $flow.u⁰, ν=$flow.ν) - suite[repr(n)]["BDIM!"] = @benchmarkable WaterLily.BDIM!($flow) - suite[repr(n)]["BC!"] = @benchmarkable BC!($flow.u, $flow.U) - suite[repr(n)]["project!"] = @benchmarkable WaterLily.project!($flow, $pois) - suite[repr(n)]["CFL"] = @benchmarkable WaterLily.CFL($flow) -end - -# Run benchmarks -samples = 100 # Use >1 since timings reported are min(samples), and the first run always compiles -verbose = true -r = run(suite, samples = samples, seconds = 1e6, verbose = verbose) -save_object("benchmark/mom_step/mom_step_5678_serial.dat", r) \ No newline at end of file From 1cc8c3ff4e84a9a3967103445b778f7ea131847f Mon Sep 17 00:00:00 2001 From: Bernat Font Date: Thu, 21 Dec 2023 09:18:22 +0100 Subject: [PATCH 6/9] Fixed typo in docs. 
--- benchmark/benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh index 74505dfe..a5192bc9 100644 --- a/benchmark/benchmark.sh +++ b/benchmark/benchmark.sh @@ -12,11 +12,11 @@ # The following arguments would generate benchmarks for the "tgv.jl" case: # -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32" # which in addition to the benchmark arguments, altogether can be used to launch this script as: -# sh benchmark.sh -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 3 6" -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32" +# sh benchmark.sh -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 6" -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32" # Case arguments accept a list of parameters for each case, and the list index is shared across these arguments (hence lists must have equal length): # -c "tgv.jl donut.jl" -p "5,6,7 7,8" -s "100 500" -ft "Float32 Float64" # which would run the same benchmarks for the TGV as before, and benchmarks for the donut case too resulting into -# 2 Julia versions x (2 Array + 1 CuArray) backends x (3 TGV sizes + 2 donut sizes) = 15 benchmarks +# 2 Julia versions x (2 Array + 1 CuArray) backends x (3 TGV sizes + 2 donut sizes) = 30 benchmarks # # Benchmarks are saved in JSON format with the following nomenclature: # casename_sizes_maxsteps_ftype_backend_waterlilyHEADhash_juliaversion.json From 32e2463910a852a87633f4873af02b0da7bb8a2a Mon Sep 17 00:00:00 2001 From: Bernat Font <17761372+b-fg@users.noreply.github.com> Date: Thu, 28 Dec 2023 18:20:24 +0100 Subject: [PATCH 7/9] Improved error verbose in benchmark.sh as suggested by @giordano MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mosè Giordano --- benchmark/benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh index a5192bc9..a932287c 100644 --- a/benchmark/benchmark.sh +++ b/benchmark/benchmark.sh @@ -114,7 +114,7 @@ case "$1" in shift ;; *) - printf "ERROR: Invalid argument\n" + printf "ERROR: Invalid argument %s\n" "${1}" 1>&2 exit 1 esac shift From 3e53d2fc590b53d28a7d1882b1da21960709bc2f Mon Sep 17 00:00:00 2001 From: Bernat Font <17761372+b-fg@users.noreply.github.com> Date: Thu, 28 Dec 2023 20:53:16 +0100 Subject: [PATCH 8/9] Added newline at end of file. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mosè Giordano --- benchmark/benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh index a932287c..d5fca237 100644 --- a/benchmark/benchmark.sh +++ b/benchmark/benchmark.sh @@ -159,4 +159,4 @@ done # Restore julia system version to default one and exit juliaup default $DEFAULT_JULIA_VERSION echo "All done!" -exit 0 \ No newline at end of file +exit 0 From ba88cc981ec8dcff4352d5fabc500a90f609e702 Mon Sep 17 00:00:00 2001 From: Bernat Font Date: Thu, 28 Dec 2023 21:41:22 +0100 Subject: [PATCH 9/9] Added suggestions by @giordano: - juliaup now uses the +{version} approach instead of modifying the default juliaup version - Pkg.develop now uses dirname(@__DIR__) Also moved docs to the new README.md file to improve readability. 
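The `+{version}` selector the message refers to is the channel-forwarding syntax of the juliaup shim. A brief sketch of the pattern (channel name illustrative):

```
# Install the channel once, then select it per invocation
# instead of switching the global default
juliaup add 1.10.0-rc1
julia +1.10.0-rc1 --project --startup-file=no tgv.jl --backend=Array
```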
---
 benchmark/README.md    | 45 ++++++++++++++++++++++++++++++++++
 benchmark/benchmark.sh | 52 ++++--------------------------------------
 2 files changed, 50 insertions(+), 47 deletions(-)
 create mode 100644 benchmark/README.md

diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..aebd35ba
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,45 @@
+# Automatic benchmark generation suite
+
+Suite to generate benchmarks across different Julia versions (using [juliaup](https://github.com/JuliaLang/juliaup)), backends, cases, and case sizes using the [benchmark.sh](./benchmark.sh) script.
+
+## TL;DR
+Usage example:
+```
+sh benchmark.sh -v "1.9.4 1.10.0-rc1" -t "1 3 6" -b "Array CuArray" -c "tgv.jl" -p "5,6,7"
+```
+The default launch is equivalent to:
+```
+sh benchmark.sh -v JULIA_USER_VERSION -t "1 6" -b "Array CuArray" -c "tgv.jl" -p "5,6,7" -s 100 -ft Float32
+```
+
+## Usage information
+
+The accepted command-line arguments are (short versions in parentheses):
+ - Backend arguments: `--versions(-v)`, `--backends(-b)`, `--threads(-t)`. Respectively: Julia versions, backend types, and number of threads (for the `Array` backend). These arguments accept a list of different parameters, for example:
+ ```
+ -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 6"
+ ```
+ which would generate benchmarks for all combinations of these parameters.
+ - Case arguments: `--cases(-c)`, `--log2p(-p)`, `--max_steps(-s)`, `--ftype(-ft)`. Respectively: benchmark case file, case sizes, number of time steps, and floating-point data type. The following arguments would generate benchmarks for the [`tgv.jl`](./tgv.jl) case:
+ ```
+ -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32"
+ ```
+ which, together with the backend arguments, can be used to launch this script as:
+ ```
+ sh benchmark.sh -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 6" -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32"
+ ```
+ Case arguments accept a list of parameters for each case, and the list index is shared across these arguments (hence the lists must have equal length):
+ ```
+ -c "tgv.jl donut.jl" -p "5,6,7 7,8" -s "100 500" -ft "Float32 Float64"
+ ```
+ which would run the same benchmarks for the TGV as before, and benchmarks for the donut case too, resulting in 2 Julia versions x (2 Array + 1 CuArray) backends x (3 TGV sizes + 2 donut sizes) = 30 benchmarks.
+
+Benchmarks are saved in JSON format with the following nomenclature: `casename_sizes_maxsteps_ftype_backend_waterlilyHEADhash_juliaversion.json`. Benchmarks can then be compared using [`compare.jl`](./compare.jl) as follows:
+```
+julia --project compare.jl benchmark_1.json benchmark_2.json benchmark_3.json ...
+```
+Note that benchmarks for each case should be compared separately. If a single case is benchmarked, and all the JSON files in the current directory belong to it, one can simply run:
+```
+julia --project compare.jl $(find . -name "*.json" -printf "%T@ %Tc %p\n" | sort -n | awk '{print $8}')
+```
+which would take all the JSON files, sort them by creation time, and pass them as arguments to the `compare.jl` program. Finally, note that the first benchmark passed as an argument is taken as the reference to compute the speed-ups of the other benchmarks: `speedup_x = time(benchmark_1) / time(benchmark_x)`.
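As a complement to the `compare.jl` workflow in the README above, a single result file can also be inspected directly with BenchmarkTools. A short sketch, with an illustrative file name and assuming the suite layout produced by `tgv.jl` (backend string -> log2p -> function):

```
using BenchmarkTools

# BenchmarkTools.save writes a one-element vector, so take the first entry
results = BenchmarkTools.load("tgv_567_100_Float32_CPUx6_abc1234_1.9.4.json")[1]

# Minimum time of the log2p=5 sim_step! trial, converted from ns to s
trial = results["CPUx6"]["5"]["sim_step!"]
println(minimum(trial.times) / 1e9)
```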
diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh index d5fca237..c36f5e4d 100644 --- a/benchmark/benchmark.sh +++ b/benchmark/benchmark.sh @@ -1,40 +1,4 @@ #!/bin/bash -# ---- Automatic benchmark generation script -# Allows to generate benchmark across different julia versions, backends, cases, and cases sizes. -# juliaup is required: https://github.com/JuliaLang/juliaup -# -# Accepted arguments are (parenthesis for short version): -# - Backend arguments: --version(-v), --backends(-b) --threads(-t) [Julia version, backend types, number of threads (for Array backend)] -# These arguments accept a list of different parameters, for example: -# -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 6" -# which would generate benchmark for all these combinations of parameters. -# - Case arguments: --cases(-c), --log2p(-p), --max_steps(-s), --ftype(-ft) [Benchmark case file, case sizes, number of time steps, float data type] -# The following arguments would generate benchmarks for the "tgv.jl" case: -# -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32" -# which in addition to the benchmark arguments, altogether can be used to launch this script as: -# sh benchmark.sh -v "1.8.5 1.9.4" -b "Array CuArray" -t "1 6" -c "tgv.jl" -p "5,6,7" -s 100 -ft "Float32" -# Case arguments accept a list of parameters for each case, and the list index is shared across these arguments (hence lists must have equal length): -# -c "tgv.jl donut.jl" -p "5,6,7 7,8" -s "100 500" -ft "Float32 Float64" -# which would run the same benchmarks for the TGV as before, and benchmarks for the donut case too resulting into -# 2 Julia versions x (2 Array + 1 CuArray) backends x (3 TGV sizes + 2 donut sizes) = 30 benchmarks -# -# Benchmarks are saved in JSON format with the following nomenclature: -# casename_sizes_maxsteps_ftype_backend_waterlilyHEADhash_juliaversion.json -# Benchmarks can be finally compared using compare.jl as follows -# julia --project compare.jl benchmark_1.json benchmark_2.json benchmark_3.json ... -# Note that each case benchmarks should be compared separately. -# If a single case is benchmarked, and all the JSON files in the current directory belong to it, one can simply run: -# julia --project compare.jl $(find . -name "*.json" -printf "%T@ %Tc %p\n" | sort -n | awk '{print $8}') -# which would take all the JSON files, sort them by creation time, and pass them as arguments to the compare.jl program. -# Finally, note that the first benchmark passed as argument is taken as reference to compute speedups of other benchmarks: -# speedup_x = time(benchmark_1) / time(benchmark_x). -# -# TL;DR: Usage example -# sh benchmark.sh -v "1.9.4 1.10.0-rc1" -t "1 3 6" -b "Array CuArray" -c "tgv.jl" -p "5,6,7" -# The default launch is equivalent to: -# sh benchmark.sh -v JULIA_DEFAULT -t "1 6" -b "Array CuArray" -c "tgv.jl" -p "5,6,7" -s 100 -ft Float32 -# ---- - # Grep current julia version julia_version () { @@ -45,14 +9,13 @@ julia_version () { # Update project environment with new Julia version update_environment () { echo "Updating environment to Julia v$version" - juliaup default $version # Mark WaterLily as a development package. Then update dependencies and precompile. 
-    julia --project -e "using Pkg; Pkg.develop(PackageSpec(path=join(split(pwd(), '/')[1:end-1], '/'))); Pkg.update();"
+    julia +${version} --project -e "using Pkg; Pkg.develop(PackageSpec(path=dirname(@__DIR__))); Pkg.update();"
 }
 
 run_benchmark () {
-    echo "Running: julia --project $args"
-    julia --project $args
+    echo "Running: julia +${version} --project --startup-file=no $args"
+    julia +${version} --project --startup-file=no $args
 }
 
 # Print benchamrks info
@@ -72,8 +35,8 @@ display_info () {
 }
 
 # Default backends
-DEFAULT_JULIA_VERSION=$(julia_version)
-VERSIONS=($DEFAULT_JULIA_VERSION)
+JULIA_USER_VERSION=$(julia_version)
+VERSIONS=($JULIA_USER_VERSION)
 BACKENDS=('Array' 'CuArray')
 THREADS=('1' '6')
 # Default cases. Arrays below must be same length (specify each case individually)
@@ -153,10 +116,5 @@ for version in "${VERSIONS[@]}" ; do
     done
 done
 
-# To compare all the benchmarks in this directory, run
-# julia --project compare.jl $(find . -name "*.json" -printf "%T@ %Tc %p\n" | sort -n | awk '{print $8}')
-
-# Restore julia system version to default one and exit
-juliaup default $DEFAULT_JULIA_VERSION
 echo "All done!"
 exit 0
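Since each case script parses its own flags through `parse_cla` in `util.jl`, a case can also be launched directly, bypassing `benchmark.sh`. A minimal sketch with illustrative values (assumes the corresponding juliaup channel is installed):

```
# Equivalent to one inner iteration of the benchmark.sh loop
julia +1.9.4 --project --startup-file=no -t 6 tgv.jl \
    --log2p=5,6,7 --max_steps=100 --ftype=Float32 --backend=Array
```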