From 991066d5f74029af1f1da407655870765c37e060 Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Wed, 26 Jul 2023 12:41:16 +0100 Subject: [PATCH 1/7] Distributed Ensembles (MPI) Also clarify CUDAEnsemble::getLogs() now returns std::map --- .../running-multiple-simulations/index.rst | 56 ++++++++++++++++--- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/src/guide/running-multiple-simulations/index.rst b/src/guide/running-multiple-simulations/index.rst index 5aa8c8702..69b498581 100644 --- a/src/guide/running-multiple-simulations/index.rst +++ b/src/guide/running-multiple-simulations/index.rst @@ -140,7 +140,9 @@ Next you need to decide which data will be collected, as it is not possible to e A short example is shown below, however you should refer to the :ref:`previous chapter` for the comprehensive guide. -One benefit of using :class:`CUDAEnsemble` to carry out experiments, is that the specific :class:`RunPlan` data is included in each log file, allowing them to be automatically processed and used for reproducible research. However, this does not identify the particular version or build of your model. +One benefit of using :class:`CUDAEnsemble` to carry out experiments, is that the specific :class:`RunPlan` data is included in each log file, allowing them to be automatically processed and used for reproducible research. However, this does not identify the particular version or build of your model. + +If you wish to post-process the logs programmatically, then :func:`CUDAEnsemble::getLogs()` can be used to fetch a map of :class:`RunLog` where keys correspond to the index of successful runs within the input :class:`RunPlanVector` (if a simulation run failed it will not have a log within the map). Agent data is logged according to agent state, so agents with multiple states must have the config specified for each state required to be logged. @@ -163,8 +165,8 @@ One benefit of using :class:`CUDAEnsemble` to carry out exit_log_cfg.logEnvironment("lerp_float"); // Pass the logging configs to the CUDAEnsemble - cuda_ensemble.setStepLog(step_log_cfg); - cuda_ensemble.setExitLog(exit_log_cfg); + ensemble.setStepLog(step_log_cfg); + ensemble.setExitLog(exit_log_cfg); .. code-tab:: py Python @@ -183,8 +185,8 @@ One benefit of using :class:`CUDAEnsemble` to carry out exit_log_cfg.logEnvironment("lerp_float") # Pass the logging configs to the CUDAEnsemble - cuda_ensemble.setStepLog(step_log_cfg) - cuda_ensemble.setExitLog(exit_log_cfg) + ensemble.setStepLog(step_log_cfg) + ensemble.setExitLog(exit_log_cfg) Configuring & Running the Ensemble ---------------------------------- @@ -208,6 +210,7 @@ Long Argument Short Argument Description By default the :enum:`ErrorLevel` will be set to "slow" (1). ``--standby`` Allow the operating system to enter standby during ensemble execution. The standby blocking feature is currently only supported on Windows, where it is enabled by default. +``--no-mpi`` Do not use MPI (only available when built with ``FLAMEGPU_ENABLE_MPI`` at CMake configuration time). 
============================== =========================== ======================================================== You may also wish to specify your own defaults, by setting the values prior to calling :func:`initialise()`: @@ -235,11 +238,18 @@ You may also wish to specify your own defaults, by setting the values prior to c ensemble.initialise(argc, argv); // Pass the logging configs to the CUDAEnsemble - cuda_ensemble.setStepLog(step_log_cfg); - cuda_ensemble.setExitLog(exit_log_cfg); + ensemble.setStepLog(step_log_cfg); + ensemble.setExitLog(exit_log_cfg); // Execute the ensemble using the specified RunPlans const unsigned int errs = ensemble.simulate(runs); + + // Fetch the RunLogs of successful runs + const std::map<unsigned int, flamegpu::RunLog> &logs = ensemble.getLogs(); + for (const auto &[plan_id, log] : logs) { + // Post-process the logs + ... + } .. code-tab:: py Python @@ -262,12 +272,19 @@ You may also wish to specify your own defaults, by setting the values prior to c ensemble.initialise(sys.argv) # Pass the logging configs to the CUDAEnsemble - cuda_ensemble.setStepLog(step_log_cfg) - cuda_ensemble.setExitLog(exit_log_cfg) + ensemble.setStepLog(step_log_cfg) + ensemble.setExitLog(exit_log_cfg) # Execute the ensemble using the specified RunPlans errs = ensemble.simulate(runs) + # Fetch the RunLogs of successful runs + logs = ensemble.getLogs() + for plan_id, log in logs.items(): + # Post-process the logs + ... + + Error Handling Within Ensembles ------------------------------- @@ -285,6 +302,27 @@ The default error level is "Slow" (1), which will cause an exception to be raise Alternatively, calls to :func:`simulate()` return the number of errors, when the error level is set to "Off" (0). Therefore, failed runs can be probed manually via checking that the return value of :func:`simulate()` does not equal zero. +Distributed Ensembles via MPI +----------------------------- + +For particularly expensive batch runs you may wish to distribute the workload across multiple nodes. This can be achieved via Message Passing Interface (MPI) support. + +To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag, :class:`CUDAEnsemble` will use MPI by default when the MPI world size exceeds 1. This can be overridden by passing ``--no-mpi`` at runtime or setting the ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` to ``false``. + +When executing with MPI, :class:`CUDAEnsemble` will execute the input :class:`RunPlanVector` across all available GPUs and concurrent runs, automatically assigning jobs when a runner becomes free. This should achieve better load balancing than manually dividing work across nodes. + +The call to :func:`CUDAEnsemble::simulate()` will both initialise and finalise the MPI state; as such it can only be called once. + +All three error-levels are supported and behave similarly. In all cases the rank 0 instance will be the only instance to raise an exception after the MPI group exits cleanly. + +If programmatically accessing run logs when using MPI, via :func:`CUDAEnsemble::getLogs()`, each MPI instance will return the logs for the runs it personally completed. This enables further post-processing to remain distributed. + +For more guidance around using MPI, such as how to launch MPI jobs, you should refer to the documentation for the HPC system you will be using. + +.. warning:: + + :class:`CUDAEnsemble` MPI support assumes that each instance has exclusive access to all visible GPUs.
Non-exclusive GPU access is likely to lead to overallocation of resources and unnecessary model failures. It's only necessary to launch 1 MPI instance per node, as :class:`CUDAEnsemble` is natively able to utilise multiple GPUs within a single node. + Related Links ------------- From fb9e548607d5d2e281dc8cf31585821a6755ada5 Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Fri, 28 Jul 2023 13:37:14 +0100 Subject: [PATCH 2/7] Address cleanup() and cuda-aware MPI --- src/guide/running-multiple-simulations/index.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/guide/running-multiple-simulations/index.rst b/src/guide/running-multiple-simulations/index.rst index 69b498581..fa6a2c12e 100644 --- a/src/guide/running-multiple-simulations/index.rst +++ b/src/guide/running-multiple-simulations/index.rst @@ -250,6 +250,9 @@ You may also wish to specify your own defaults, by setting the values prior to c // Post-process the logs ... } + + // Ensure profiling / memcheck work correctly (and trigger MPI_Finalize()) + flamegpu::util::cleanup(); .. code-tab:: py Python @@ -284,6 +287,8 @@ You may also wish to specify your own defaults, by setting the values prior to c # Post-process the logs ... + # Ensure profiling / memcheck work correctly (and trigger MPI_Finalize()) + pyflamegpu.cleanup() Error Handling Within Ensembles ------------------------------- @@ -309,6 +314,8 @@ For particularly expensive batch runs you may wish to distribute the workload ac To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag, :class:`CUDAEnsemble` will use MPI by default when the MPI world size exceeds 1. This can be overridden by passing ``--no-mpi`` at runtime or setting the ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` to ``false``. +It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default. Hence it's only necessary to launch 1 runner per node. When executing with MPI, :class:`CUDAEnsemble` will execute the input :class:`RunPlanVector` across all available GPUs and concurrent runs, automatically assigning jobs when a runner becomes free. This should achieve better load balancing than manually dividing work across nodes. The call to :func:`CUDAEnsemble::simulate()` will both initialise and finalise the MPI state; as such it can only be called once. @@ -322,6 +329,10 @@ For more guidance around using MPI, such as how to launch MPI jobs, you should r .. warning:: :class:`CUDAEnsemble` MPI support assumes that each instance has exclusive access to all visible GPUs. Non-exclusive GPU access is likely to lead to overallocation of resources and unnecessary model failures. It's only necessary to launch 1 MPI instance per node, as :class:`CUDAEnsemble` is natively able to utilise multiple GPUs within a single node. + +.. warning:: + + :func:`flamegpu::util::cleanup()` must be called before the program returns when using MPI, as this triggers ``MPI_Finalize()``. Related Links From 3ea61c20213fc83cf3ba811c1ae414d75df2dbee Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Wed, 13 Sep 2023 10:38:49 +0100 Subject: [PATCH 3/7] Docs additions based on Pete's feedback.
--- src/guide/running-multiple-simulations/index.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/guide/running-multiple-simulations/index.rst b/src/guide/running-multiple-simulations/index.rst index fa6a2c12e..69c6ea5bf 100644 --- a/src/guide/running-multiple-simulations/index.rst +++ b/src/guide/running-multiple-simulations/index.rst @@ -310,15 +310,15 @@ Alternatively, calls to :func:`simulate()` ret Distributed Ensembles via MPI ----------------------------- -For particularly expensive batch runs you may wish to distribute the workload across multiple nodes. This can be achieved via Message Passing Interface (MPI) support. +For particularly expensive batch runs you may wish to distribute the workload across multiple nodes within a HPC cluster. This can be achieved via Message Passing Interface (MPI) support. To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag, :class:`CUDAEnsemble` will use MPI by default when the MPI world size exceeds 1. This can be overridden by passing ``--no-mpi`` at runtime or setting the ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` to ``false``. -It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default. Hence it's only necessary to launch 1 runner per node. +It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default using its existing multi-GPU support (as opposed to GPU direct MPI comms). Hence it's only necessary to launch 1 runner per node, although multiple CPU threads are still recommended (e.g. a minimum of ``N+1``, where ``N`` is the number of GPUs in the node). When executing with MPI, :class:`CUDAEnsemble` will execute the input :class:`RunPlanVector` across all available GPUs and concurrent runs, automatically assigning jobs when a runner becomes free. This should achieve better load balancing than manually dividing work across nodes. -The call to :func:`CUDAEnsemble::simulate()` will both initialise and finalise the MPI state; as such it can only be called once. +The call to :func:`CUDAEnsemble::simulate()` will initialise MPI state if necessary; in order to cleanly exit, :func:`flamegpu::util::cleanup()` must be called before the program exits. Hence, you may call :func:`CUDAEnsemble::simulate()` multiple times to execute multiple ensembles via MPI in a single execution, or probe the MPI world state prior to launching the ensemble. From 94a4fc73fb4b7c9ea96c6f6d94b80305fbfe03e3 Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Fri, 22 Sep 2023 15:10:18 +0100 Subject: [PATCH 4/7] Remove references to --no-mpi --- src/guide/running-multiple-simulations/index.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/guide/running-multiple-simulations/index.rst b/src/guide/running-multiple-simulations/index.rst index 69c6ea5bf..139ceb8f6 100644 --- a/src/guide/running-multiple-simulations/index.rst +++ b/src/guide/running-multiple-simulations/index.rst @@ -210,7 +210,6 @@ Long Argument Short Argument Description By default the :enum:`ErrorLevel` will be set to "slow" (1). ``--standby`` Allow the operating system to enter standby during ensemble execution.
The standby blocking feature is currently only supported on Windows, where it is enabled by default. -``--no-mpi`` Do not use MPI (only available when built with ``FLAMEGPU_ENABLE_MPI`` at CMake configuration time). ============================== =========================== ======================================================== You may also wish to specify your own defaults, by setting the values prior to calling :func:`initialise()`: @@ -312,7 +311,7 @@ Distributed Ensembles via MPI For particularly expensive batch runs you may wish to distribute the workload across multiple nodes within a HPC cluster. This can be achieved via Message Passing Interface (MPI) support. -To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag, :class:`CUDAEnsemble` will use MPI by default when the MPI world size exceeds 1. This can be overridden by passing ``--no-mpi`` at runtime or setting the ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` to ``false``. +To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag :class:`CUDAEnsemble` will use MPI by default. The ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` will be set ``true`` if MPI support was enabled at compile time. It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default using its existing multi-GPU support (as opposed to GPU direct MPI comms). Hence it's only necessary to launch 1 runner per node, although multiple CPU threads are still recommended (e.g. a minimum of ``N+1``, where ``N`` is the number of GPUs in the node). From 19f99c0e6de5dc19864334af18c9a098db895ae2 Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Tue, 7 Nov 2023 15:01:28 +0000 Subject: [PATCH 5/7] Add examples of how to call diff MPI implementations with 1 process per node --- src/guide/running-multiple-simulations/index.rst | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/guide/running-multiple-simulations/index.rst b/src/guide/running-multiple-simulations/index.rst index 139ceb8f6..9cd3aafcf 100644 --- a/src/guide/running-multiple-simulations/index.rst +++ b/src/guide/running-multiple-simulations/index.rst @@ -313,7 +313,19 @@ For particularly expensive batch runs you may wish to distribute the workload ac To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag :class:`CUDAEnsemble` will use MPI by default. The ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` will be set ``true`` if MPI support was enabled at compile time. -It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default using its existing multi-GPU support (as opposed to GPU direct MPI comms). Hence it's only necessary to launch 1 runner per node, although multiple CPU threads are still recommended (e.g. a minimum of ``N+1``, where ``N`` is the number of GPUs in the node). +It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default using its existing multi-GPU support (as opposed to GPU direct MPI comms). Hence it's only necessary to launch 1 process per node, although multiple CPU threads are still recommended (e.g. a minimum of ``N+1``, where ``N`` is the number of GPUs in the node). ..
note:: + + MPI implementations differ in how to request 1 process per node when calling MPI. A few examples are provided below: + + * `Open MPI`_: ``mpirun --pernode`` or ``mpirun --npernode 1`` + * `MVAPICH2`_: ``mpirun_rsh -ppn 1`` + * `Bede`_: ``bede-mpirun --bede-par 1ppn`` + +.. _Open MPI: https://www.open-mpi.org/doc/v4.0/man1/mpirun.1.php +.. _MVAPICH2: https://mvapich.cse.ohio-state.edu/static/media/mvapich/mvapich2-userguide.html#x1-320005.2.1 +.. _Bede: https://bede-documentation.readthedocs.io/en/latest/usage/index.html?#multiple-nodes-mpi When executing with MPI, :class:`CUDAEnsemble` will execute the input :class:`RunPlanVector` across all available GPUs and concurrent runs, automatically assigning jobs when a runner becomes free. This should achieve better load balancing than manually dividing work across nodes. From 0560a622baebff28755beb44401d7a723618f40b Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Tue, 7 Nov 2023 15:11:06 +0000 Subject: [PATCH 6/7] Clarify tests_mpi and Python support. --- src/guide/running-multiple-simulations/index.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/guide/running-multiple-simulations/index.rst b/src/guide/running-multiple-simulations/index.rst index 9cd3aafcf..8bf5319b2 100644 --- a/src/guide/running-multiple-simulations/index.rst +++ b/src/guide/running-multiple-simulations/index.rst @@ -309,7 +309,7 @@ Alternatively, calls to :func:`simulate()` ret Distributed Ensembles via MPI ----------------------------- -For particularly expensive batch runs you may wish to distribute the workload across multiple nodes within a HPC cluster. This can be achieved via Message Passing Interface (MPI) support. +For particularly expensive batch runs you may wish to distribute the workload across multiple nodes within a HPC cluster. This can be achieved via Message Passing Interface (MPI) support. This feature is supported by both the C++ and Python interfaces to FLAMEGPU, however is not available in pre-built binaries/packages/wheels and must be compiled from source as required. To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag :class:`CUDAEnsemble` will use MPI by default. The ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` which will be set ``true`` if MPI support was enabled at compile time. @@ -331,19 +331,21 @@ When executing with MPI, :class:`CUDAEnsemble` will exec The call to :func:`CUDAEnsemble::simulate()` will initialise MPI state if this has necessary, in order to cleanly exit :func:`flamegpu::util::cleanup()` must be called before the program exits. Hence, you may call :func:`CUDAEnsemble::simulate()` multiple times to execute multiple ensembles via MPI in a single execution, or probe the MPI world state prior to launching the ensemble. -All three error-levels are supported and behave similarly. In all cases the rank 0 instance will be the only instance to raise an exception after the MPI group exits cleanly. +All three error-levels are supported and behave similarly. In all cases the rank 0 process will be the only process to raise an exception after the MPI group exits cleanly. -If programmatically accessing run logs when using MPI, via :func:`CUDAEnsemble::getLogs()`, each MPI instance will return the logs for the runs it personally completed. This enables further post-processing to remain distributed. 
+If programmatically accessing run logs when using MPI, via :func:`CUDAEnsemble::getLogs()`, each MPI process will return the logs for the runs it personally completed. This enables further post-processing to remain distributed. For more guidance around using MPI, such as how to launch MPI jobs, you should refer to the documentation for the HPC system you will be using. .. warning:: - :class:`CUDAEnsemble` MPI support assumes that each instance has exclusive access to all visible GPUs. Non-exclusive GPU access is likely to lead to overallocation of resources and unnecessary model failures. It's only necessary to launch 1 MPI instance per node, as :class:`CUDAEnsemble` is natively able to utilise multiple GPUs within a single node. + :class:`CUDAEnsemble` MPI support assumes that each process has exclusive access to all visible GPUs. Non-exclusive GPU access is likely to lead to overallocation of resources and unnecessary model failures. It's only necessary to launch 1 MPI process per node, as :class:`CUDAEnsemble` is natively able to utilise multiple GPUs within a single node. .. warning:: :func:`flamegpu::util::cleanup()` must be called before the program returns when using MPI, as this triggers ``MPI_Finalize()``. + +FLAMEGPU has a dedicated MPI test suite; this can be built and run via the ``tests_mpi`` CMake target. It is configured to automatically divide GPUs between MPI processes when executed with MPI on a single node (e.g. ``mpirun -n 2 ./tests_mpi``) and scale across any multi-node configuration. Due to limitations with GoogleTest, each runner will execute tests and print to stdout/stderr, and crashes during a test may cause the suite to deadlock. Related Links From ffe7a1b61b754d892855f928b4f6f8881c499d05 Mon Sep 17 00:00:00 2001 From: Peter Heywood Date: Thu, 14 Dec 2023 11:46:52 +0000 Subject: [PATCH 7/7] Update MPI ensemble documentation to reflect changes --- .../running-multiple-simulations/index.rst | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/guide/running-multiple-simulations/index.rst b/src/guide/running-multiple-simulations/index.rst index 8bf5319b2..616ec9938 100644 --- a/src/guide/running-multiple-simulations/index.rst +++ b/src/guide/running-multiple-simulations/index.rst @@ -309,11 +309,15 @@ Alternatively, calls to :func:`simulate()` ret Distributed Ensembles via MPI ----------------------------- -For particularly expensive batch runs you may wish to distribute the workload across multiple nodes within a HPC cluster. This can be achieved via Message Passing Interface (MPI) support. This feature is supported by both the C++ and Python interfaces to FLAMEGPU, however is not available in pre-built binaries/packages/wheels and must be compiled from source as required. +For particularly expensive batch runs you may wish to distribute the workload across multiple nodes within a HPC cluster. This can be achieved via Message Passing Interface (MPI) support. This feature is supported by both the C++ and Python interfaces to FLAMEGPU, however it is not available in pre-built binaries/packages/wheels and must be compiled from source as required. -To enable MPI support FLAMEGPU should be compiled with the CMake flag ``FLAMEGPU_ENABLE_MPI``. When compiled with this flag :class:`CUDAEnsemble` will use MPI by default. The ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` will be set ``true`` if MPI support was enabled at compile time.
+To enable MPI support FLAMEGPU should be configured with the CMake flag ``FLAMEGPU_ENABLE_MPI`` enabled. When compiled with this flag :class:`CUDAEnsemble` will use MPI. The ``mpi`` member of the :class:`CUDAEnsemble::EnsembleConfig` will be set ``true`` if MPI support was enabled at compile time. -It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default using its existing multi-GPU support (as opposed to GPU direct MPI comms). Hence it's only necessary to launch 1 process per node, although multiple CPU threads are still recommended (e.g. a minimum of ``N+1``, where ``N`` is the number of GPUs in the node). +It is not necessary to use a CUDA-aware MPI library, as :class:`CUDAEnsemble` will make use of all available GPUs by default using its existing multi-GPU support (as opposed to GPU direct MPI comms). +Hence it's only necessary to launch 1 process per node, although requesting multiple CPU cores in a HPC environment is still recommended (e.g. a minimum of ``N+1``, where ``N`` is the number of GPUs in the node). + +If more than one MPI process is launched per node, the available GPUs will be load-balanced across the MPI ranks. +If more MPI processes are launched per node than there are GPUs available, a warning will be issued, as the additional MPI ranks are unnecessary and will not participate in execution of the ensemble. .. note:: @@ -327,9 +331,9 @@ It is not necessary to use a CUDA aware MPI library, -When executing with MPI, :class:`CUDAEnsemble` will execute the input :class:`RunPlanVector` across all available GPUs and concurrent runs, automatically assigning jobs when a runner becomes free. This should achieve better load balancing than manually dividing work across nodes. +When executing with MPI, :class:`CUDAEnsemble` will execute the input :class:`RunPlanVector` across all available GPUs and concurrent runs, automatically assigning jobs when a runner becomes free. This should achieve better load balancing than manually dividing work across nodes, but may lead to increased HPC queue times as the nodes must be available concurrently. -The call to :func:`CUDAEnsemble::simulate()` will initialise MPI state if necessary; in order to cleanly exit, :func:`flamegpu::util::cleanup()` must be called before the program exits. Hence, you may call :func:`CUDAEnsemble::simulate()` multiple times to execute multiple ensembles via MPI in a single execution, or probe the MPI world state prior to launching the ensemble. +The call to :func:`CUDAEnsemble::simulate()` will initialise MPI state if necessary; in order to cleanly exit, :func:`flamegpu::util::cleanup()` must be called before the program exits. Hence, you may call :func:`CUDAEnsemble::simulate()` multiple times to execute multiple ensembles via MPI in a single execution, or probe the MPI world state prior to launching the ensemble, but :func:`flamegpu::util::cleanup()` must only be called once. All three error-levels are supported and behave similarly. In all cases the rank 0 process will be the only process to raise an exception after the MPI group exits cleanly. For more guidance around using MPI, such as how to launch MPI jobs, you should refer to the documentation for the HPC system you will be using. .. warning:: - :class:`CUDAEnsemble` MPI support assumes that each process has exclusive access to all visible GPUs. Non-exclusive GPU access is likely to lead to overallocation of resources and unnecessary model failures.
It's only necessary to launch 1 MPI process per node, as :class:`CUDAEnsemble` is natively able to utilise multiple GPUs within a single node. + :class:`CUDAEnsemble` MPI support distributes GPUs within a shared memory system (node) across the MPI ranks assigned to that node, to avoid overallocation of resources and unnecessary model failures. It's only necessary to launch 1 MPI process per node, as :class:`CUDAEnsemble` is natively able to utilise multiple GPUs within a single node, and a warning will be emitted if more MPI ranks are assigned to a node than there are visible GPUs. .. warning:: - :func:`flamegpu::util::cleanup()` must be called before the program returns when using MPI, as this triggers ``MPI_Finalize()``. + :func:`flamegpu::util::cleanup()` must be called before the program returns when using MPI, as this triggers ``MPI_Finalize()``. It must only be called once per process. -FLAMEGPU has a dedicated MPI test suite; this can be built and run via the ``tests_mpi`` CMake target. It is configured to automatically divide GPUs between MPI processes when executed with MPI on a single node (e.g. ``mpirun -n 2 ./tests_mpi``) and scale across any multi-node configuration. Due to limitations with GoogleTest, each runner will execute tests and print to stdout/stderr, and crashes during a test may cause the suite to deadlock. +FLAMEGPU has a dedicated MPI test suite; this can be built and run via the ``tests_mpi`` CMake target. It is configured to automatically divide GPUs between MPI processes when executed with MPI on a single node (e.g. ``mpirun -n 2 ./tests_mpi``) and scale across any multi-node configuration. Some tests will not run if only a single GPU (and therefore MPI rank) is available. Due to limitations with GoogleTest, each runner will execute tests and print to stdout/stderr, and crashes during a test may cause the suite to deadlock. Related Links
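Putting the behaviour described across these patches together, the sketch below shows how an MPI-distributed ensemble might be driven from the Python interface. It is illustrative only: it assumes a pyflamegpu build configured with ``FLAMEGPU_ENABLE_MPI``, and the ``create_model()`` helper, the run counts and the logged agent/variable names are hypothetical stand-ins for your own model and experiment design.

.. code-block:: python

    import sys
    import pyflamegpu

    # Hypothetical helper returning a pyflamegpu.ModelDescription; substitute your own model definition.
    model = create_model()

    # Describe the experiment: 100 runs of 1000 steps (illustrative values).
    runs = pyflamegpu.RunPlanVector(model, 100)
    runs.setSteps(1000)

    # Configure what to log each step (as in the logging examples above).
    step_log_cfg = pyflamegpu.StepLoggingConfig(model)
    step_log_cfg.setFrequency(1)
    step_log_cfg.agent("boid").logCount()

    # Create and configure the ensemble; when built with FLAMEGPU_ENABLE_MPI and
    # launched with 1 process per node, the runs are shared between the nodes.
    ensemble = pyflamegpu.CUDAEnsemble(model)
    ensemble.initialise(sys.argv)
    ensemble.setStepLog(step_log_cfg)

    # Execute the ensemble; the return value is the number of failed runs.
    errs = ensemble.simulate(runs)

    # Each MPI process only holds the logs of the runs it completed,
    # so post-processing here remains distributed across the ranks.
    for plan_id, log in ensemble.getLogs().items():
        pass  # post-process each RunLog as required

    # Required exactly once per process when MPI is in use; triggers MPI_Finalize().
    pyflamegpu.cleanup()

Under Open MPI, for example, such a script might be launched across nodes with ``mpirun --npernode 1 python3 my_ensemble.py`` (the script name is hypothetical), following the one-process-per-node guidance given in these patches.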