diff --git a/.github/workflows/MPI.yml b/.github/workflows/MPI.yml new file mode 100644 index 000000000..70c6122f2 --- /dev/null +++ b/.github/workflows/MPI.yml @@ -0,0 +1,261 @@ +# Perform builds with supported MPI versions +name: MPI + +on: + # On pull_requests which mutate this CI workflow, using pr rather than push:branch to ensure that the merged state will be OK, as that is what is important here. + pull_request: + paths: + - ".github/workflows/MPI.yml" + - "tests/test_cases/simulation/test_mpi_ensemble.cu" + - "include/flamegpu/simulation/detail/MPISimRunner.h" + - "include/flamegpu/simulation/detail/AbstractSimRunner.h" + - "src/flamegpu/simulation/detail/MPISimRunner.cu" + - "src/flamegpu/simulation/detail/AbstractSimRunner.cu" + - "src/flamegpu/simulation/CUDAEnsemble.cu" + - "**mpi**" + - "**MPI**" + # Or trigger on manual dispatch. + workflow_dispatch: + +defaults: + run: + # Default to using bash regardless of OS unless otherwise specified. + shell: bash + +jobs: + build-ubuntu: + runs-on: ${{ matrix.cudacxx.os }} + strategy: + fail-fast: false + # Multiplicative build matrix + # optional exclude: can be partial, include: must be specific + matrix: + # CUDA_ARCH values are reduced compared to wheels due to CI memory issues while compiling the test suite. + cudacxx: + - cuda: "12.0" + cuda_arch: "50-real;" + hostcxx: gcc-11 + os: ubuntu-22.04 + python: + - "3.8" + mpi: + - lib: "openmpi" + version: "apt" # MPI 3.1 + # - lib: "openmpi" + # version: "4.1.6" # MPI 3.1 + # - lib: "openmpi" + # version: "4.0.0" # MPI 3.1 + # - lib: "openmpi" + # version: "3.0.0" # MPI 3.1 + # - lib: "openmpi" + # version: "2.0.0" # MPI 3.1 + - lib: "openmpi" + version: "1.10.7" # MPI 3.0 + - lib: "mpich" + version: "apt" # MPI 4.0 + # - lib: "mpich" + # version: "4.1.2" # MPI 4.0 + # - lib: "mpich" + # version: "4.0" # MPI 4.0 + # - lib: "mpich" + # version: "3.4.3" # MPI 3.1 + - lib: "mpich" + version: "3.3" # MPI 3.1 + config: + - name: "Release" + config: "Release" + SEATBELTS: "ON" + VISUALISATION: + - "OFF" + + # Name the job based on matrix/env options + name: "build-ubuntu-mpi (${{ matrix.mpi.lib }}, ${{ matrix.mpi.version }}, ${{ matrix.cudacxx.cuda }}, ${{matrix.python}}, ${{ matrix.VISUALISATION }}, ${{ matrix.config.name }}, ${{ matrix.cudacxx.os }})" + + # Define job-wide env constants, and promote matrix elements to env constants for portable steps. + env: + # Define constants + BUILD_DIR: "build" + FLAMEGPU_BUILD_TESTS: "ON" + # Conditional based on matrix via awkward almost ternary + FLAMEGPU_BUILD_PYTHON: ${{ fromJSON('{true:"ON",false:"OFF"}')[matrix.python != ''] }} + # Port matrix options to environment, for more portability. 
+ CUDA: ${{ matrix.cudacxx.cuda }} + CUDA_ARCH: ${{ matrix.cudacxx.cuda_arch }} + HOSTCXX: ${{ matrix.cudacxx.hostcxx }} + OS: ${{ matrix.cudacxx.os }} + CONFIG: ${{ matrix.config.config }} + FLAMEGPU_SEATBELTS: ${{ matrix.config.SEATBELTS }} + PYTHON: ${{ matrix.python }} + MPI_LIB: ${{ matrix.mpi.lib }} + MPI_VERSION: ${{ matrix.mpi.version }} + VISUALISATION: ${{ matrix.VISUALISATION }} + + steps: + - uses: actions/checkout@v3 + + - name: Install CUDA + if: ${{ startswith(env.OS, 'ubuntu') && env.CUDA != '' }} + env: + cuda: ${{ env.CUDA }} + run: .github/scripts/install_cuda_ubuntu.sh + + - name: Install/Select gcc and g++ + if: ${{ startsWith(env.HOSTCXX, 'gcc-') }} + run: | + gcc_version=${HOSTCXX//gcc-/} + sudo apt-get install -y gcc-${gcc_version} g++-${gcc_version} + echo "CC=/usr/bin/gcc-${gcc_version}" >> $GITHUB_ENV + echo "CXX=/usr/bin/g++-${gcc_version}" >> $GITHUB_ENV + echo "CUDAHOSTCXX=/usr/bin/g++-${gcc_version}" >> $GITHUB_ENV + + - name: Install MPI from apt + if: ${{ env.MPI_VERSION == 'apt' }} + working-directory: ${{ runner.temp }} + run: | + sudo apt-get install lib${{ env.MPI_LIB }}-dev + + - name: Install OpenMPI from source + if: ${{ env.MPI_VERSION != 'apt' && env.MPI_LIB == 'openmpi' }} + working-directory: ${{ runner.temp }} + run: | + # Note: using download.open-mpi.org as gh tags aren't pre configured + MPI_VERISON_MAJOR_MINOR=$(cut -d '.' -f 1,2 <<< "${{ env.MPI_VERSION}}") + echo "https://download.open-mpi.org/release/open-mpi/v${MPI_VERISON_MAJOR_MINOR}/openmpi-${{ env.MPI_VERSION}}.tar.gz" + wget -q https://download.open-mpi.org/release/open-mpi/v${MPI_VERISON_MAJOR_MINOR}/openmpi-${{ env.MPI_VERSION}}.tar.gz --output-document openmpi-${{ env.MPI_VERSION }}.tar.gz || (echo "An Error occurred while downloading OpenMPI '${{ env.MPI_VERSION }}'. Is it a valid version of OpenMPI?" && exit 1) + tar -zxvf openmpi-${{ env.MPI_VERSION }}.tar.gz + cd openmpi-${{ env.MPI_VERSION}} + ./configure --prefix="${{ runner.temp }}/mpi" + make -j `nproc` + make install -j `nproc` + echo "${{ runner.temp }}/mpi/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=${{ runner.temp }}/mpi/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "LD_RUN_PATH=${{ runner.temp }}/mpi/lib:${LD_RUN_PATH}" >> $GITHUB_ENV + + # This will only work for mpich >= 3.3: + # 3.0-3.2 doesn't appear compatible with default gcc in 22.04. + # 1.x is named mpich2 so requires handling differently + # Uses the ch3 interface, as ch4 isn't available pre 3.4, but one must be specified for some versions + - name: Install MPICH from source + if: ${{ env.MPI_VERSION != 'apt' && env.MPI_LIB == 'mpich' }} + working-directory: ${{ runner.temp }} + run: | + MPI_MAJOR=$(cut -d '.' -f 1 <<< "${{ env.MPI_VERSION}}") + MPI_MINOR=$(cut -d '.' -f 2 <<< "${{ env.MPI_VERSION}}") + [[ ${MPI_MAJOR} < 3 ]] && echo "MPICH must be >= 3.0" && exit 1 + echo "https://www.mpich.org/static/downloads/${{ env.MPI_VERSION }}/mpich-${{ env.MPI_VERSION}}.tar.gz" + wget -q https://www.mpich.org/static/downloads/${{ env.MPI_VERSION }}/mpich-${{ env.MPI_VERSION}}.tar.gz --output-document mpich-${{ env.MPI_VERSION }}.tar.gz || (echo "An Error occurred while downloading MPICH '${{ env.MPI_VERSION }}'. Is it a valid version of MPICH?" 
&& exit 1) + tar -zxvf mpich-${{ env.MPI_VERSION }}.tar.gz + cd mpich-${{ env.MPI_VERSION}} + DISABLE_FORTRAN_FLAGS="" + if (( ${MPI_MAJOR} >= 4 )) || ( ((${MPI_MAJOR} >= 3)) && ((${MPI_MINOR} >= 2)) ); then + # MPICH >= 3.2 has --disable-fortran + DISABLE_FORTRAN_FLAGS="--disable-fortran" + else + DISABLE_FORTRAN_FLAGS="--disable-f77 --disable-fc" + fi + ./configure --prefix="${{ runner.temp }}/mpi" --with-device=ch3 ${DISABLE_FORTRAN_FLAGS} + make -j `nproc` + make install -j `nproc` + echo "${{ runner.temp }}/mpi/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=${{ runner.temp }}/mpi/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "LD_RUN_PATH=${{ runner.temp }}/mpi/lib:${LD_RUN_PATH}" >> $GITHUB_ENV + + - name: Select Python + if: ${{ env.PYTHON != '' && env.FLAMEGPU_BUILD_PYTHON == 'ON' }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON }} + + - name: Install python dependencies + if: ${{ env.PYTHON != '' && env.FLAMEGPU_BUILD_PYTHON == 'ON' }} + run: | + sudo apt-get install python3-venv + python3 -m pip install --upgrade wheel build setuptools + + - name: Install Visualisation Dependencies + if: ${{ startswith(env.OS, 'ubuntu') && env.VISUALISATION == 'ON' }} + run: | + # Install ubuntu-20.04 packages + if [ "$OS" == 'ubuntu-20.04' ]; then + sudo apt-get install -y libglew-dev libfontconfig1-dev libsdl2-dev libdevil-dev libfreetype-dev + fi + # Install Ubuntu 18.04 packages + if [ "$OS" == 'ubuntu-18.04' ]; then + sudo apt-get install -y libglew-dev libfontconfig1-dev libsdl2-dev libdevil-dev libfreetype6-dev libgl1-mesa-dev + fi + + - name: Install Swig >= 4.0.2 + run: | + # Remove existing swig install, so CMake finds the correct swig + if [ "$OS" == 'ubuntu-20.04' ]; then + sudo apt-get remove -y swig swig4.0 + fi + # Install Ubuntu 18.04 packages + if [ "$OS" == 'ubuntu-18.04' ]; then + sudo apt-get remove -y swig + fi + # Install additional apt-based dependencies required to build swig 4.0.2 + sudo apt-get install -y bison + # Create a local directory to build swig in. + mkdir -p swig-from-source && cd swig-from-source + # Install SWIG building from source dependencies + wget https://github.com/swig/swig/archive/refs/tags/v4.0.2.tar.gz + tar -zxf v4.0.2.tar.gz + cd swig-4.0.2/ + ./autogen.sh + ./configure + make + sudo make install + + # This pre-emptively patches a bug where ManyLinux didn't generate buildnumber as git dir was owned by diff user + - name: Enable git safe-directory + run: git config --global --add safe.directory $GITHUB_WORKSPACE + + - name: Configure cmake + run: > + cmake . -B "${{ env.BUILD_DIR }}" + -DCMAKE_BUILD_TYPE="${{ env.CONFIG }}" + -Werror=dev + -DCMAKE_WARN_DEPRECATED="OFF" + -DFLAMEGPU_WARNINGS_AS_ERRORS="ON" + -DCMAKE_CUDA_ARCHITECTURES="${{ env.CUDA_ARCH }}" + -DFLAMEGPU_BUILD_TESTS="${{ env.FLAMEGPU_BUILD_TESTS }}" + -DFLAMEGPU_BUILD_PYTHON="${{ env.FLAMEGPU_BUILD_PYTHON }}" + -DPYTHON3_EXACT_VERSION="${{ env.PYTHON }}" + -DFLAMEGPU_VISUALISATION="${{ env.VISUALISATION }}" + -DFLAMEGPU_ENABLE_MPI="ON" + -DFLAMEGPU_ENABLE_NVTX="ON" + ${MPI_OVERRIDE_CXX_OPTIONS} + + - name: Reconfigure cmake fixing MPICH from apt + if: ${{ env.MPI_VERSION == 'apt' && env.MPI_LIB == 'mpich' }} + run: > + cmake . -B "${{ env.BUILD_DIR }}" + -DMPI_CXX_COMPILE_OPTIONS="" + + - name: Build static library + working-directory: ${{ env.BUILD_DIR }} + run: cmake --build . --target flamegpu --verbose -j `nproc` + + - name: Build python wheel + if: ${{ env.FLAMEGPU_BUILD_PYTHON == 'ON' }} + working-directory: ${{ env.BUILD_DIR }} + run: cmake --build . 
--target pyflamegpu --verbose -j `nproc` + + - name: Build tests + if: ${{ env.FLAMEGPU_BUILD_TESTS == 'ON' }} + working-directory: ${{ env.BUILD_DIR }} + run: cmake --build . --target tests --verbose -j `nproc` + + - name: Build tests_mpi + if: ${{ env.FLAMEGPU_BUILD_TESTS == 'ON' }} + working-directory: ${{ env.BUILD_DIR }} + run: cmake --build . --target tests_mpi --verbose -j `nproc` + + - name: Build ensemble example + working-directory: ${{ env.BUILD_DIR }} + run: cmake --build . --target ensemble --verbose -j `nproc` + + - name: Build all remaining targets + working-directory: ${{ env.BUILD_DIR }} + run: cmake --build . --target all --verbose -j `nproc` diff --git a/README.md b/README.md index af15ffc12..20f769ec7 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,9 @@ Optionally: + With `setuptools`, `wheel`, `build` and optionally `venv` python packages installed + [swig](http://www.swig.org/) `>= 4.0.2` for python integration + Swig `4.x` will be automatically downloaded by CMake if not provided (if possible). ++ MPI (e.g. [MPICH](https://www.mpich.org/), [OpenMPI](https://www.open-mpi.org/)) for distributed ensemble support + + MPI 3.0+ tested, older MPIs may work but not tested. + + CMake `>= 3.20.1` may be required for some MPI libraries / platforms. + [FLAMEGPU2-visualiser](https://github.com/FLAMEGPU/FLAMEGPU2-visualiser) dependencies + [SDL](https://www.libsdl.org/) + [GLM](http://glm.g-truc.net/) *(consistent C++/GLSL vector maths functionality)* @@ -176,6 +179,7 @@ cmake --build . --target all | `FLAMEGPU_VERBOSE_PTXAS` | `ON`/`OFF` | Enable verbose PTXAS output during compilation. Default `OFF`. | | `FLAMEGPU_CURAND_ENGINE` | `XORWOW` / `PHILOX` / `MRG` | Select the CUDA random engine. Default `XORWOW` | | `FLAMEGPU_ENABLE_GLM` | `ON`/`OFF` | Experimental feature for GLM type support within models. Default `OFF`. | +| `FLAMEGPU_ENABLE_MPI` | `ON`/`OFF` | Enable MPI support for distributed CUDAEnsembles, each MPI worker should have exclusive access to it's GPUs e.g. 1 MPI worker per node. Default `OFF`. | | `FLAMEGPU_ENABLE_ADVANCED_API` | `ON`/`OFF` | Enable advanced API functionality (C++ only), providing access to internal sim components for high-performance extensions. No stability guarantees are provided around this interface and the returned objects. Documentation is limited to that found in the source. Default `OFF`. | | `FLAMEGPU_SHARE_USAGE_STATISTICS` | `ON`/`OFF` | Share usage statistics ([telemetry](https://docs.flamegpu.com/guide/telemetry)) to support evidencing usage/impact of the software. Default `ON`. | | `FLAMEGPU_TELEMETRY_SUPPRESS_NOTICE` | `ON`/`OFF` | Suppress notice encouraging telemetry to be enabled, which is emitted once per binary execution if telemetry is disabled. Defaults to `OFF`, or the value of a system environment variable of the same name. 
| diff --git a/examples/cpp/ensemble/src/main.cu b/examples/cpp/ensemble/src/main.cu index df87d3a43..528891426 100644 --- a/examples/cpp/ensemble/src/main.cu +++ b/examples/cpp/ensemble/src/main.cu @@ -17,8 +17,7 @@ FLAMEGPU_INIT_FUNCTION(Init) { std::atomic atomic_init = {0}; std::atomic atomic_result = {0}; FLAMEGPU_EXIT_FUNCTION(Exit) { - atomic_init += FLAMEGPU->environment.getProperty("init"); - atomic_result += FLAMEGPU->agent("Agent").sum("x"); + FLAMEGPU->environment.setProperty("result", FLAMEGPU->agent("Agent").sum("x")); } int main(int argc, const char ** argv) { flamegpu::ModelDescription model("boids_spatial3D"); @@ -35,6 +34,7 @@ int main(int argc, const char ** argv) { env.newProperty("init", 0); env.newProperty("init_offset", 0); env.newProperty("offset", 1); + env.newProperty("result", 0); } { // Agent flamegpu::AgentDescription agent = model.newAgent("Agent"); @@ -64,27 +64,46 @@ int main(int argc, const char ** argv) { runs.setPropertyLerpRange("init_offset", 1, 0); runs.setPropertyLerpRange("offset", 0, 99); } - + /** + * Create a logging config + */ + flamegpu::LoggingConfig exit_log_cfg(model); + exit_log_cfg.logEnvironment("init"); + exit_log_cfg.logEnvironment("result"); /** * Create Model Runner */ flamegpu::CUDAEnsemble cuda_ensemble(model, argc, argv); - + cuda_ensemble.setExitLog(exit_log_cfg); cuda_ensemble.simulate(runs); - // Check result - // Don't currently have logging - unsigned int init_sum = 0; - uint64_t result_sum = 0; - for (int i = 0 ; i < 100; ++i) { - const int init = i/10; - const int init_offset = 1 - i/50; - init_sum += init; - result_sum += POPULATION_TO_GENERATE * init + init_offset * ((POPULATION_TO_GENERATE-1)*POPULATION_TO_GENERATE/2); // Initial agent values - result_sum += POPULATION_TO_GENERATE * STEPS * i; // Agent values added by steps + /** + * Check result for each log + */ + const std::map &logs = cuda_ensemble.getLogs(); + if (!cuda_ensemble.Config().mpi || logs.size() > 0) { + unsigned int init_sum = 0, expected_init_sum = 0; + uint64_t result_sum = 0, expected_result_sum = 0; + + for (const auto &[i, log] : logs) { + const int init = i/10; + const int init_offset = 1 - i/50; + expected_init_sum += init; + expected_result_sum += POPULATION_TO_GENERATE * init + init_offset * ((POPULATION_TO_GENERATE-1)*POPULATION_TO_GENERATE/2); // Initial agent values + expected_result_sum += POPULATION_TO_GENERATE * STEPS * i; // Agent values added by steps + const flamegpu::ExitLogFrame &exit_log = log.getExitLog(); + init_sum += exit_log.getEnvironmentProperty("init"); + result_sum += exit_log.getEnvironmentProperty("result"); + } + printf("Ensemble init: %u, calculated init %u\n", expected_init_sum, init_sum); + printf("Ensemble result: %zu, calculated result %zu\n", expected_result_sum, result_sum); + } + /** + * Report if MPI was enabled + */ + if (cuda_ensemble.Config().mpi) { + printf("Local MPI runner completed %u/%u runs.\n", static_cast(logs.size()), static_cast(runs.size())); } - printf("Ensemble init: %u, calculated init %u\n", atomic_init.load(), init_sum); - printf("Ensemble result: %zu, calculated result %zu\n", atomic_result.load(), result_sum); // Ensure profiling / memcheck work correctly flamegpu::util::cleanup(); diff --git a/include/flamegpu/simulation/CUDAEnsemble.h b/include/flamegpu/simulation/CUDAEnsemble.h index dafd00e9e..848261b94 100644 --- a/include/flamegpu/simulation/CUDAEnsemble.h +++ b/include/flamegpu/simulation/CUDAEnsemble.h @@ -1,6 +1,7 @@ #ifndef INCLUDE_FLAMEGPU_SIMULATION_CUDAENSEMBLE_H_ #define 
INCLUDE_FLAMEGPU_SIMULATION_CUDAENSEMBLE_H_ +#include #include #include #include @@ -9,7 +10,6 @@ #include "flamegpu/defines.h" namespace flamegpu { - struct ModelData; class ModelDescription; class RunPlanVector; @@ -87,6 +87,15 @@ class CUDAEnsemble { #else const bool block_standby = false; #endif + /** + * Flag which denotes whether MPI support is enabled + * @note This cannot be changed at runtime, depends on compilation settings. + */ +#ifdef FLAMEGPU_ENABLE_MPI + const bool mpi = true; +#else + const bool mpi = false; +#endif bool telemetry = false; }; @@ -151,7 +160,7 @@ class CUDAEnsemble { /** * Return the list of logs collected from the last call to simulate() */ - const std::vector &getLogs(); + const std::map &getLogs(); private: /** @@ -181,9 +190,9 @@ class CUDAEnsemble { */ std::shared_ptr exit_log_config; /** - * Logs collected by simulate() + * Logs collected by simulate(), where the map's key corresponds to the index of the associated RunPlan */ - std::vector run_logs; + std::map run_logs; /** * Model description hierarchy for the ensemble, a copy of this will be passed to every CUDASimulation */ diff --git a/include/flamegpu/simulation/CUDASimulation.h b/include/flamegpu/simulation/CUDASimulation.h index 76c246c59..958ef01a6 100644 --- a/include/flamegpu/simulation/CUDASimulation.h +++ b/include/flamegpu/simulation/CUDASimulation.h @@ -37,7 +37,7 @@ namespace flamegpu { namespace detail { -class SimRunner; +class AbstractSimRunner; class CUDAAgent; class CUDAMessage; } // namespace detail @@ -62,7 +62,7 @@ class CUDASimulation : public Simulation { /** * Requires internal access to getCUDAAgent() */ - friend class detail::SimRunner; + friend class detail::AbstractSimRunner; friend class CUDAEnsemble; #ifdef FLAMEGPU_VISUALISATION /** @@ -98,7 +98,7 @@ class CUDASimulation : public Simulation { * CUDA runner specific config */ struct Config { - friend class detail::SimRunner; + friend class detail::AbstractSimRunner; friend class CUDASimulation; friend class HostAPI; /** diff --git a/include/flamegpu/simulation/RunPlan.h b/include/flamegpu/simulation/RunPlan.h index e9d860fa0..24ccd3da4 100644 --- a/include/flamegpu/simulation/RunPlan.h +++ b/include/flamegpu/simulation/RunPlan.h @@ -15,7 +15,7 @@ namespace flamegpu { namespace detail { -class SimRunner; +class AbstractSimRunner; } class ModelDescription; class RunPlanVector; @@ -31,7 +31,7 @@ class XMLLogger; */ class RunPlan { friend class RunPlanVector; - friend class detail::SimRunner; + friend class detail::AbstractSimRunner; friend class CUDASimulation; friend class io::JSONLogger; friend class io::XMLLogger; @@ -264,7 +264,7 @@ void RunPlan::setProperty(const std::string &name, const flamegpu::size_type ind } const unsigned int t_index = detail::type_decode::len_t * index + detail::type_decode::len_t; if (it->second.data.elements < t_index || t_index < index) { - throw exception::OutOfBoundsException("Environment property array index out of bounds " + THROW exception::OutOfBoundsException("Environment property array index out of bounds " "in RunPlan::setProperty()\n"); } // Check whether array already exists in property overrides @@ -383,7 +383,7 @@ T RunPlan::getProperty(const std::string &name, const flamegpu::size_type index) } const unsigned int t_index = detail::type_decode::len_t * index + detail::type_decode::len_t; if (it->second.data.elements < t_index || t_index < index) { - throw exception::OutOfBoundsException("Environment property array index out of bounds " + THROW 
exception::OutOfBoundsException("Environment property array index out of bounds " "in RunPlan::getProperty()\n"); } // Check whether property already exists in property overrides diff --git a/include/flamegpu/simulation/RunPlanVector.h b/include/flamegpu/simulation/RunPlanVector.h index fd0b2bfd5..166e252c0 100644 --- a/include/flamegpu/simulation/RunPlanVector.h +++ b/include/flamegpu/simulation/RunPlanVector.h @@ -25,7 +25,7 @@ class EnvironmentDescription; */ class RunPlanVector : private std::vector { friend class RunPlan; - friend class detail::SimRunner; + friend class detail::AbstractSimRunner; friend unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans); public: @@ -406,7 +406,7 @@ void RunPlanVector::setProperty(const std::string &name, const flamegpu::size_ty } const unsigned int t_index = detail::type_decode::len_t * index + detail::type_decode::len_t; if (t_index > it->second.data.elements || t_index < index) { - throw exception::OutOfBoundsException("Environment property array index out of bounds " + THROW exception::OutOfBoundsException("Environment property array index out of bounds " "in RunPlanVector::setProperty()\n"); } for (auto &i : *this) { @@ -492,7 +492,7 @@ void RunPlanVector::setPropertyLerpRange(const std::string &name, const flamegpu } const unsigned int t_index = detail::type_decode::len_t * index + detail::type_decode::len_t; if (t_index > it->second.data.elements || t_index < index) { - throw exception::OutOfBoundsException("Environment property array index out of bounds " + THROW exception::OutOfBoundsException("Environment property array index out of bounds " "in RunPlanVector::setPropertyLerpRange()\n"); } unsigned int ct = 0; @@ -602,7 +602,7 @@ void RunPlanVector::setPropertyRandom(const std::string &name, const flamegpu::s } const unsigned int t_index = detail::type_decode::len_t * index + detail::type_decode::len_t; if (t_index > it->second.data.elements || t_index < index) { - throw exception::OutOfBoundsException("Environment property array index out of bounds " + THROW exception::OutOfBoundsException("Environment property array index out of bounds " "in RunPlanVector::setPropertyRandom()\n"); } for (auto &i : *this) { diff --git a/include/flamegpu/simulation/detail/AbstractSimRunner.h b/include/flamegpu/simulation/detail/AbstractSimRunner.h new file mode 100644 index 000000000..1b854d507 --- /dev/null +++ b/include/flamegpu/simulation/detail/AbstractSimRunner.h @@ -0,0 +1,193 @@ +#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ABSTRACTSIMRUNNER_H_ +#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ABSTRACTSIMRUNNER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef FLAMEGPU_ENABLE_MPI +#include +#endif + +#include "flamegpu/defines.h" +#include "flamegpu/simulation/LogFrame.h" + +namespace flamegpu { +struct ModelData; +class LoggingConfig; +class StepLoggingConfig; +class RunPlanVector; +class CUDAEnsemble; +namespace detail { +/** +* Common interface and implementation shared between SimRunner and MPISimRunner +*/ +class AbstractSimRunner { + friend class flamegpu::CUDAEnsemble; + /** + * Create a new thread and trigger main() to execute the SimRunner + */ + void start(); + + public: + struct ErrorDetail { + unsigned int run_id; + unsigned int device_id; + unsigned int runner_id; + char exception_string[1024]; + }; +#ifdef FLAMEGPU_ENABLE_MPI + static MPI_Datatype createErrorDetailMPIDatatype() { + static MPI_Datatype rtn = MPI_DATATYPE_NULL; + if (rtn == MPI_DATATYPE_NULL) { + ErrorDetail t; + constexpr int count = 
4; + constexpr int array_of_blocklengths[count] = {1, 1, 1, sizeof(ErrorDetail::exception_string)}; + const char* t_ptr = reinterpret_cast(&t); + const MPI_Aint array_of_displacements[count] = { reinterpret_cast(&t.run_id) - t_ptr, + reinterpret_cast(&t.device_id) - t_ptr, + reinterpret_cast(&t.runner_id) - t_ptr, + reinterpret_cast(&t.exception_string) - t_ptr }; + const MPI_Datatype array_of_types[count] = {MPI_UNSIGNED, MPI_UNSIGNED, MPI_UNSIGNED, MPI_CHAR}; + MPI_Type_create_struct(count, array_of_blocklengths, array_of_displacements, array_of_types, &rtn); + MPI_Type_commit(&rtn); + } + return rtn; + } +#endif + + /** + * Constructor, creates and initialises the underlying thread + * @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances. + * @param _err_ct Reference to an atomic integer for tracking how many errors have occurred + * @param _next_run Atomic counter for safely selecting the next run plan to execute across multiple threads + * @param _plans The vector of run plans to be executed by the ensemble + * @param _step_log_config The config of which data should be logged each step + * @param _exit_log_config The config of which data should be logged at run exit + * @param _device_id The GPU that all runs should execute on + * @param _runner_id A unique index assigned to the runner + * @param _verbosity Verbosity level (Verbosity::Quiet, Verbosity::Default, Verbosity::Verbose) + * @param run_logs Reference to the vector to store generate run logs + * @param log_export_queue The queue of logs to exported to disk + * @param log_export_queue_mutex This mutex must be locked to access log_export_queue + * @param log_export_queue_cdn The condition is notified every time a log has been added to the queue + * @param err_detail Structure to store error details on fast failure for main thread rethrow + * @param _total_runners Total number of runners executing + * @param _isSWIG Flag denoting whether it's a Python build of FLAMEGPU + */ + AbstractSimRunner(const std::shared_ptr _model, + std::atomic &_err_ct, + std::atomic &_next_run, + const RunPlanVector &_plans, + std::shared_ptr _step_log_config, + std::shared_ptr _exit_log_config, + int _device_id, + unsigned int _runner_id, + flamegpu::Verbosity _verbosity, + std::map &run_logs, + std::queue &log_export_queue, + std::mutex &log_export_queue_mutex, + std::condition_variable &log_export_queue_cdn, + std::vector &err_detail, + unsigned int _total_runners, + bool _isSWIG); + /** + * Virtual class requires polymorphic destructor + */ + virtual ~AbstractSimRunner() {} + /** + * Subclass implementation of SimRunner + */ + virtual void main() = 0; + /** + * Blocking call which if thread->joinable() triggers thread->join() + */ + void join(); + + protected: + /** + * Create and execute the simulation for the RunPlan within plans of given index + * @throws Exceptions during sim execution may be raised, these should be caught and handled by the caller + */ + void runSimulation(int plan_id); + /** + * The thread which the SimRunner executes on + */ + std::thread thread; + /** + * Each sim runner takes it's own clone of model description hierarchy, so it can manipulate environment without conflict + */ + const std::shared_ptr model; + /** + * CUDA Device index of runner + */ + const int device_id; + /** + * Per instance unique runner id + */ + const unsigned int runner_id; + /** + * Total number of runners executing + * This is used to calculate the progress on job 
completion + */ + const unsigned int total_runners; + /** + * Flag for whether to print progress + */ + const flamegpu::Verbosity verbosity; + // External references + /** + * Reference to an atomic integer for tracking how many errors have occurred + */ + std::atomic &err_ct; + /** + * Atomic counter for safely selecting the next run plan to execute across multiple threads + * This is used differently by each class of runner + */ + std::atomic &next_run; + /** + * Reference to the vector of run configurations to be executed + */ + const RunPlanVector &plans; + /** + * Config specifying which data to log per step + */ + const std::shared_ptr step_log_config; + /** + * Config specifying which data to log at run exit + */ + const std::shared_ptr exit_log_config; + /** + * Reference to the map to store generated run logs + */ + std::map &run_logs; + /** + * The queue of logs to be exported to disk + */ + std::queue &log_export_queue; + /** + * This mutex must be locked to access log_export_queue + */ + std::mutex &log_export_queue_mutex; + /** + * The condition is notified every time a log has been added to the queue + */ + std::condition_variable &log_export_queue_cdn; + /** + * Error details will be stored here + */ + std::vector& err_detail; + /** + * If true, the model is using the SWIG Python interface + **/ + const bool isSWIG; +}; + +} // namespace detail +} // namespace flamegpu + +#endif // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ABSTRACTSIMRUNNER_H_ diff --git a/include/flamegpu/simulation/detail/MPIEnsemble.h b/include/flamegpu/simulation/detail/MPIEnsemble.h new file mode 100644 index 000000000..028e808a0 --- /dev/null +++ b/include/flamegpu/simulation/detail/MPIEnsemble.h @@ -0,0 +1,192 @@ +#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPIENSEMBLE_H_ +#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPIENSEMBLE_H_ + +#include + +#include +#include +#include +#include +#include + +#include "flamegpu/simulation/CUDAEnsemble.h" +#include "flamegpu/simulation/detail/MPISimRunner.h" + +namespace flamegpu { +namespace detail { + +class MPIEnsemble { + const CUDAEnsemble::EnsembleConfig &config; + // Tags to differentiate the MPI messages used in the protocol + enum EnvelopeTag : int { + // Sent from worker to manager to request a job index to process + RequestJob = 0, + // Sent from manager to worker to assign a job index to process in response to RequestJob + AssignJob = 1, + // Sent from worker to manager to report an error during job execution + // If fail fast is enabled, the following RequestJob will receive an exit job id (>=plans.size()) + ReportError = 2, + // Sent from worker to manager to report GPUs for telemetry + TelemetryDevices = 3, + }; + + public: + /** + * The rank within the world MPI communicator + */ + const int world_rank; + /** + * The size of the world MPI communicator + */ + const int world_size; + /** + * The rank within the MPI shared memory communicator (i.e. within the node) + */ + const int local_rank; + /** + * The size of the MPI shared memory communicator (i.e. within the node) + */ + const int local_size; + /** + * The total number of runs to be executed (only used for printing error warnings) + */ + const unsigned int total_runs; + /** + * Construct the object for managing MPI comms during an ensemble + * + * Initialises the MPI_Datatype MPI_ERROR_DETAIL, detects world rank and size. 
+ * + * @param _config The parent ensemble's config (mostly used to check error/verbosity levels) + * @param _total_runs The total number of runs to be executed (only used for printing error warnings) + */ + explicit MPIEnsemble(const CUDAEnsemble::EnsembleConfig &_config, unsigned int _total_runs); + /** + * If world_rank==0, receive any waiting errors and add their details to err_detail + * @param err_detail The map to store new error details within + * @return The number of errors that occurred. + */ + int receiveErrors(std::multimap &err_detail); + /** + * If world_rank==0, receive and process any waiting job requests + * @param next_run A reference to the int which tracks the progress through the run plan vector + * @return The number of runners that have been told to exit (if next_run>total_runs) + */ + int receiveJobRequests(unsigned int &next_run); + /** + * If world_rank!=0, send the provided error detail to world_rank==0 + * @param e_detail The error detail to be sent + */ + void sendErrorDetail(AbstractSimRunner::ErrorDetail &e_detail); + /** + * If world_rank!=0, request a job from world_rank==0 and return the response + * @return The index of the assigned job + */ + int requestJob(); + /** + * Wait for all MPI ranks to reach a barrier + */ + void worldBarrier(); + /** + * If world_rank!=0 and local_rank == 0, send the local GPU string to world_rank==0 and return empty string + * If world_rank==0, receive GPU strings and assemble the full remote GPU string to be returned + */ + std::string assembleGPUsString(); + /** + * Common function for handling local errors during MPI execution + * Needs the set of in-use devices, not the config specified list of devices + */ + void retrieveLocalErrorDetail(std::mutex &log_export_queue_mutex, + std::multimap &err_detail, + std::vector &err_detail_local, int i, std::set devices); + /** + * Create the split MPI Communicator based on if the thread is participating in ensemble execution or not, based on the group rank and number of local GPUs. + * @param isParticipating If this rank is participating (i.e. it has a local device assigned) + * @return success of this method + */ + bool createParticipatingCommunicator(bool isParticipating); + /** + * Accessor method for if the rank is participating or not (i.e. the colour of the communicator split) + */ + int getRankIsParticipating() { return this->rank_is_participating; } + /** + * Accessor method for the size of the MPI communicator containing "participating" (or non-participating) ranks + */ + int getParticipatingCommSize() { return this->participating_size; } + /** + * Accessor method for the rank within the MPI communicator containing "participating" (or non-participating) ranks + */ + int getParticipatingCommRank() { return this->participating_rank; } + + /** + * Static method to select devices for the current mpi rank, based on the provided list of devices. + * This static version exists so that it is testable. + * A non static version which queries the curernt mpi environment is also provided as a simpler interface + * @param devicesToSelectFrom set of device indices to use, provided from the config or initialised to contain all visible devices. + * @param local_size the number of mpi processes on the current shared memory system + * @param local_rank the current process' mpi rank within the shared memory system + * @return the gpus to be used by the current mpi rank, which may be empty. 
+ */ + static std::set devicesForThisRank(std::set devicesToSelectFrom, int local_size, int local_rank); + + /** + * Method to select devices for the current mpi rank, based on the provided list of devices. + * This non-static version calls the other overload with the current mpi size/ranks, i.e. this is the version that should be used. + * @param devicesToSelectFrom set of device indices to use, provided from the config or initialised to contain all visible devices. + * @return the gpus to be used by the current mpi rank, which may be empty. + */ + std::set devicesForThisRank(std::set devicesToSelectFrom); + + private: + /** + * @return Retrieve the local world rank from MPI + */ + static int queryMPIWorldRank(); + /** + * @return Retrieve the world size from MPI + */ + static int queryMPIWorldSize(); + /** + * @return Retrieve the local rank within the current shared memory region + */ + static int queryMPISharedGroupRank(); + /** + * @return retrieve the number of mpi processes on the current shared memory region + */ + static int queryMPISharedGroupSize(); + /** + * If necessary initialise MPI, else do nothing + */ + static void initMPI(); + /** + * Iterate the provided set of devices to find the item at index j + * This doesn't use the config.devices as it may have been mutated based on the number of mpi ranks used. + */ + unsigned int getDeviceIndex(const int j, std::set devices); + /** + * MPI representation of AbstractSimRunner::ErrorDetail type + */ + const MPI_Datatype MPI_ERROR_DETAIL; + /** + * flag indicating if the current MPI rank is a participating rank (i.e. it has atleast one GPU it can use). + * This is not a const public member, as it can only be computed after world and local rank / sizes and local gpu count are known. + * This is used to form the colour in the participating communicator + */ + bool rank_is_participating; + /** + * An MPI communicator, split by whether the mpi rank is participating in simulation or not. + * non-participating ranks will have a commincator which only contains non-participating ranks, but these will never use the communicator + */ + MPI_Comm comm_participating; + /** + * The size of the MPI communicator containing "participating" (or non-participating) ranks + */ + int participating_size; + /** + * The rank within the MPI communicator containing "participating" (or non-participating) ranks + */ + int participating_rank; +}; +} // namespace detail +} // namespace flamegpu + +#endif // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPIENSEMBLE_H_ diff --git a/include/flamegpu/simulation/detail/MPISimRunner.h b/include/flamegpu/simulation/detail/MPISimRunner.h new file mode 100644 index 000000000..753b6805e --- /dev/null +++ b/include/flamegpu/simulation/detail/MPISimRunner.h @@ -0,0 +1,81 @@ +#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPISIMRUNNER_H_ +#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPISIMRUNNER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "flamegpu/simulation/detail/AbstractSimRunner.h" +#include "flamegpu/defines.h" +#include "flamegpu/simulation/LogFrame.h" + +namespace flamegpu { +struct ModelData; +class LoggingConfig; +class StepLoggingConfig; +class RunPlanVector; +class CUDAEnsemble; +namespace detail { + +/** + * A thread class which executes RunPlans on a single GPU, communicating with the main-thread which has jobs allocated via MPI + * + * This class is used by CUDAEnsemble, it creates one SimRunner instance per GPU, each executes in a separate thread. 
+ * There may be multiple instances per GPU, if running small models on large GPUs. + */ +class MPISimRunner : public AbstractSimRunner { + public: + enum Signal : unsigned int { + // MPISimRunner sets this to notify manager that it wants a new job + RequestJob = UINT_MAX, + RunFailed = UINT_MAX-1, + }; + /** + * Constructor, creates and initialise a new MPISimRunner + * @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances. + * @param _err_ct Reference to an atomic integer for tracking how many errors have occurred + * @param _next_run Atomic counter for safely selecting the next run plan to execute across multiple threads + * @param _plans The vector of run plans to be executed by the ensemble + * @param _step_log_config The config of which data should be logged each step + * @param _exit_log_config The config of which data should be logged at run exit + * @param _device_id The GPU that all runs should execute on + * @param _runner_id A unique index assigned to the runner + * @param _verbosity Verbosity level (Verbosity::Quiet, Verbosity::Default, Verbosity::Verbose) + * @param run_logs Reference to the vector to store generate run logs + * @param log_export_queue The queue of logs to exported to disk + * @param log_export_queue_mutex This mutex must be locked to access log_export_queue + * @param log_export_queue_cdn The condition is notified every time a log has been added to the queue + * @param err_detail_local Structure to store error details on failure for main thread to handle + * @param _total_runners Total number of runners executing + * @param _isSWIG Flag denoting whether it's a Python build of FLAMEGPU + */ + MPISimRunner(const std::shared_ptr _model, + std::atomic &_err_ct, + std::atomic &_next_run, + const RunPlanVector &_plans, + std::shared_ptr _step_log_config, + std::shared_ptr _exit_log_config, + int _device_id, + unsigned int _runner_id, + flamegpu::Verbosity _verbosity, + std::map &run_logs, + std::queue &log_export_queue, + std::mutex &log_export_queue_mutex, + std::condition_variable &log_export_queue_cdn, + std::vector &err_detail_local, + unsigned int _total_runners, + bool _isSWIG); + /** + * SimRunner loop with MPI comm with local manager + */ + void main() override; +}; + +} // namespace detail +} // namespace flamegpu + +#endif // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPISIMRUNNER_H_ diff --git a/include/flamegpu/simulation/detail/SimLogger.h b/include/flamegpu/simulation/detail/SimLogger.h index 339bacd76..04fc9b09b 100644 --- a/include/flamegpu/simulation/detail/SimLogger.h +++ b/include/flamegpu/simulation/detail/SimLogger.h @@ -1,12 +1,12 @@ #ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMLOGGER_H_ #define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMLOGGER_H_ -#include #include #include #include #include #include +#include #include "flamegpu/simulation/LogFrame.h" @@ -34,7 +34,7 @@ class SimLogger { * @param _export_step_time If true step log time will be exported * @param _export_exit_time If true exit log time will be exported */ - SimLogger(const std::vector &run_logs, + SimLogger(const std::map &run_logs, const RunPlanVector &run_plans, const std::string &out_directory, const std::string &out_format, @@ -55,9 +55,9 @@ class SimLogger { void start(); // External references /** - * Reference to the vector to store generate run logs + * Reference to the map to store generate run logs */ - const std::vector &run_logs; + const std::map &run_logs; /** * Reference to the vector of run 
configurations to be executed */ diff --git a/include/flamegpu/simulation/detail/SimRunner.h b/include/flamegpu/simulation/detail/SimRunner.h index 7943d5dc1..2fc29c789 100644 --- a/include/flamegpu/simulation/detail/SimRunner.h +++ b/include/flamegpu/simulation/detail/SimRunner.h @@ -6,12 +6,12 @@ #include #include #include -#include +#include #include -#include #include "flamegpu/defines.h" #include "flamegpu/simulation/LogFrame.h" +#include "flamegpu/simulation/detail/AbstractSimRunner.h" namespace flamegpu { struct ModelData; @@ -27,14 +27,13 @@ namespace detail { * This class is used by CUDAEnsemble, it creates one SimRunner instance per GPU, each executes in a separate thread. * There may be multiple instances per GPU, if running small models on large GPUs. */ -class SimRunner { - friend class flamegpu::CUDAEnsemble; - struct ErrorDetail { - unsigned int run_id; - unsigned int device_id; - unsigned int runner_id; - std::string exception_string; - }; +class SimRunner : public AbstractSimRunner { + /** + * Flag for whether the ensemble should throw an exception if it errors out + */ + const bool fail_fast; + + public: /** * Constructor, creates and initialise a new SimRunner * @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances. @@ -51,7 +50,7 @@ class SimRunner { * @param log_export_queue The queue of logs to exported to disk * @param log_export_queue_mutex This mutex must be locked to access log_export_queue * @param log_export_queue_cdn The condition is notified every time a log has been added to the queue - * @param fast_err_detail Structure to store error details on fast failure for main thread rethrow + * @param err_detail Structure to store error details on fast failure for main thread rethrow * @param _total_runners Total number of runners executing * @param _isSWIG Flag denoting whether it's a Python build of FLAMEGPU */ @@ -65,95 +64,17 @@ class SimRunner { unsigned int _runner_id, flamegpu::Verbosity _verbosity, bool _fail_fast, - std::vector &run_logs, + std::map &run_logs, std::queue &log_export_queue, std::mutex &log_export_queue_mutex, std::condition_variable &log_export_queue_cdn, - ErrorDetail &fast_err_detail, + std::vector &err_detail, unsigned int _total_runners, bool _isSWIG); /** - * Each sim runner takes it's own clone of model description hierarchy, so it can manipulate environment without conflict - */ - const std::shared_ptr model; - /** - * Index of current/last run in RunPlanVector executed by this SimRunner - */ - unsigned int run_id; - /** - * CUDA Device index of runner - */ - const int device_id; - /** - * Per instance unique runner id - */ - const unsigned int runner_id; - /** - * Total number of runners executing - * This is used to calculate the progress on job completion + * SimRunner loop with shared next_run atomic */ - const unsigned int total_runners; - /** - * Flag for whether to print progress - */ - const flamegpu::Verbosity verbosity; - /** - * Flag for whether the ensemble should throw an exception if it errors out - */ - const bool fail_fast; - /** - * The thread which the SimRunner executes on - */ - std::thread thread; - /** - * Start executing the SimRunner in it's separate thread - */ - void start(); - // External references - /** - * Reference to an atomic integer for tracking how many errors have occurred - */ - std::atomic &err_ct; - /** - * Atomic counter for safely selecting the next run plan to execute across multiple threads - */ - std::atomic 
&next_run; - /** - * Reference to the vector of run configurations to be executed - */ - const RunPlanVector &plans; - /** - * Config specifying which data to log per step - */ - const std::shared_ptr step_log_config; - /** - * Config specifying which data to log at run exit - */ - const std::shared_ptr exit_log_config; - /** - * Reference to the vector to store generate run logs - */ - std::vector &run_logs; - /** - * The queue of logs to exported to disk - */ - std::queue &log_export_queue; - /** - * This mutex must be locked to access log_export_queue - */ - std::mutex &log_export_queue_mutex; - /** - * The condition is notified every time a log has been added to the queue - */ - std::condition_variable &log_export_queue_cdn; - /** - * If fail_fast is true, on error details will be stored here so an exception can be thrown from the main thread - */ - ErrorDetail& fast_err_detail; - /** - * If true, the model is using SWIG Python interface - **/ - const bool isSWIG; + void main() override; }; } // namespace detail diff --git a/include/flamegpu/util/cleanup.h b/include/flamegpu/util/cleanup.h index f25b18f33..5dfa0326f 100644 --- a/include/flamegpu/util/cleanup.h +++ b/include/flamegpu/util/cleanup.h @@ -9,7 +9,7 @@ namespace flamegpu { namespace util { /** - * Method to cleanup / finalise use of FLAMEGPU. + * Method to cleanup / finalise use of FLAMEGPU (and MPI). * For CUDA implementations, this resets all CUDA devices in the current system, to ensure that CUDA tools such as cuda-memcheck, compute-sanitizer and Nsight Compute. * * This method should ideally be called as the final method prior to an `exit` or the `return` of the main method, as it is costly and can invalidate device memory allocations, potentially breaking application state. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 99385c155..d7a791166 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,6 +96,9 @@ endif() option(FLAMEGPU_ENABLE_LINT_FLAMEGPU "Enable creation of the lint_flamegpu target" ${FLAMEGPU_ENABLE_LINT_FLAMEGPU_DEFAULT}) unset(FLAMEGPU_ENABLE_LINT_FLAMEGPU_DEFAULT) +# Option to enable/disable MPI support +option(FLAMEGPU_ENABLE_MPI "Enable MPI support for distributed ensembles" OFF) + # Option to enable GLM support and put GLM on the include path option(FLAMEGPU_ENABLE_GLM "Experimental: Make GLM available to flamegpu2 projects on the include path" OFF) mark_as_advanced(FLAMEGPU_ENABLE_GLM) @@ -186,6 +189,7 @@ SET(SRC_INCLUDE ${FLAMEGPU_ROOT}/include/flamegpu/model/Variable.h ${FLAMEGPU_ROOT}/include/flamegpu/model/EnvironmentDirectedGraphData.cuh ${FLAMEGPU_ROOT}/include/flamegpu/model/EnvironmentDirectedGraphDescription.cuh + ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/AbstractSimRunner.h ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/MemoryVector.h ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/GenericMemoryVector.h ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAScanCompaction.h @@ -199,6 +203,8 @@ SET(SRC_INCLUDE ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAFatAgentStateList.h ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAScatter.cuh ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAMacroEnvironment.h + ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/MPISimRunner.h + ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/MPIEnsemble.h ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/SimRunner.h ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/SimLogger.h 
${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/AgentInterface.h @@ -317,6 +323,7 @@ SET(SRC_FLAMEGPU ${FLAMEGPU_ROOT}/src/flamegpu/model/HostFunctionDescription.cpp ${FLAMEGPU_ROOT}/src/flamegpu/model/EnvironmentDirectedGraphData.cu ${FLAMEGPU_ROOT}/src/flamegpu/model/EnvironmentDirectedGraphDescription.cu + ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/AbstractSimRunner.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CubTemporaryMemory.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAScanCompaction.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAMessageList.cu @@ -329,6 +336,8 @@ SET(SRC_FLAMEGPU ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAMacroEnvironment.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/DeviceStrings.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu + ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/MPISimRunner.cu + ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/MPIEnsemble.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/SimRunner.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/SimLogger.cu ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/EnvironmentManager.cu @@ -573,6 +582,31 @@ if (FLAMEGPU_VISUALISATION) target_compile_definitions(${PROJECT_NAME} PUBLIC FLAMEGPU_VISUALISATION) endif() +# Activate MPI support (distributed ensembles) +if (FLAMEGPU_ENABLE_MPI) + find_package(MPI REQUIRED) + if (MPI_CXX_FOUND) + # If CMake is older than 3.20.1, FindMPI may incorrectly wrap link options when CUDA is enabled. + # It is not guaranteed to be a problem, so we should warn rather than error. + # https://gitlab.kitware.com/cmake/cmake/-/issues/21887 + if(CMAKE_VERSION VERSION_LESS 3.20.1) + message(WARNING "CMake < 3.20.1 may result in link errors with FLAMEGPU_ENABLE_MPI=ON for some MPI installations. Consider using CMake >= 3.20.1.") + endif() + # If the MPI installation brings in -flto (i.e. Ubuntu 22.04 libmpich-dev), warn about it and suggest a reconfiguration. + if(MPI_CXX_COMPILE_OPTIONS MATCHES ".*\-flto.*") + message(WARNING + " MPI_CXX_COMPILE_OPTIONS contains '-flto' which is likely to result in linker errors. \n" + " Consider an alternate MPI implementation which does not embed -flto,\n" + " Or reconfiguring CMake with -DMPI_CXX_COMPILE_OPTIONS=\"\" if linker error occur.") + endif() + # This sets up include directories and lib + target_link_libraries(${PROJECT_NAME} PUBLIC MPI::MPI_CXX) + target_compile_definitions(${PROJECT_NAME} PUBLIC FLAMEGPU_ENABLE_MPI) + else() + message(FATAL_ERROR "MPI C++ Support was not found!") + endif() +endif() + # Make GLM accessible via include. PUBLIC so this is usable by downstream projects # @todo - make the vis cmake/glm create a target to use. 
if (FLAMEGPU_ENABLE_GLM) diff --git a/src/flamegpu/io/XMLLogger.cu b/src/flamegpu/io/XMLLogger.cu index 49fc4cb79..26988e660 100644 --- a/src/flamegpu/io/XMLLogger.cu +++ b/src/flamegpu/io/XMLLogger.cu @@ -15,9 +15,9 @@ namespace io { switch (a_eResult) { \ case tinyxml2::XML_ERROR_FILE_NOT_FOUND : \ case tinyxml2::XML_ERROR_FILE_COULD_NOT_BE_OPENED : \ - throw exception::InvalidInputFile("TinyXML error: File could not be opened.\n Error code: %d", a_eResult); \ + THROW exception::InvalidInputFile("TinyXML error: File could not be opened.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_FILE_READ_ERROR : \ - throw exception::InvalidInputFile("TinyXML error: File could not be read.\n Error code: %d", a_eResult); \ + THROW exception::InvalidInputFile("TinyXML error: File could not be read.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_PARSING_ELEMENT : \ case tinyxml2::XML_ERROR_PARSING_ATTRIBUTE : \ case tinyxml2::XML_ERROR_PARSING_TEXT : \ @@ -26,25 +26,25 @@ namespace io { case tinyxml2::XML_ERROR_PARSING_DECLARATION : \ case tinyxml2::XML_ERROR_PARSING_UNKNOWN : \ case tinyxml2::XML_ERROR_PARSING : \ - throw exception::TinyXMLError("TinyXML error: Error parsing file.\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: Error parsing file.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_EMPTY_DOCUMENT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_EMPTY_DOCUMENT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_EMPTY_DOCUMENT\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_MISMATCHED_ELEMENT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_MISMATCHED_ELEMENT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_MISMATCHED_ELEMENT\n Error code: %d", a_eResult); \ case tinyxml2::XML_CAN_NOT_CONVERT_TEXT : \ - throw exception::TinyXMLError("TinyXML error: XML_CAN_NOT_CONVERT_TEXT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_CAN_NOT_CONVERT_TEXT\n Error code: %d", a_eResult); \ case tinyxml2::XML_NO_TEXT_NODE : \ - throw exception::TinyXMLError("TinyXML error: XML_NO_TEXT_NODE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_NO_TEXT_NODE\n Error code: %d", a_eResult); \ case tinyxml2::XML_ELEMENT_DEPTH_EXCEEDED : \ - throw exception::TinyXMLError("TinyXML error: XML_ELEMENT_DEPTH_EXCEEDED\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ELEMENT_DEPTH_EXCEEDED\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_COUNT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_COUNT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_COUNT\n Error code: %d", a_eResult); \ case tinyxml2::XML_NO_ATTRIBUTE: \ - throw exception::TinyXMLError("TinyXML error: XML_NO_ATTRIBUTE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_NO_ATTRIBUTE\n Error code: %d", a_eResult); \ case tinyxml2::XML_WRONG_ATTRIBUTE_TYPE : \ - throw exception::TinyXMLError("TinyXML error: XML_WRONG_ATTRIBUTE_TYPE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_WRONG_ATTRIBUTE_TYPE\n Error code: %d", a_eResult); \ default: \ - throw exception::TinyXMLError("TinyXML error: Unrecognised error code\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: Unrecognised error code\n Error code: %d", a_eResult); \ 
} \ } #endif diff --git a/src/flamegpu/io/XMLStateReader.cu b/src/flamegpu/io/XMLStateReader.cu index 46a1dc2aa..b93b07ef0 100644 --- a/src/flamegpu/io/XMLStateReader.cu +++ b/src/flamegpu/io/XMLStateReader.cu @@ -22,9 +22,9 @@ namespace io { switch (a_eResult) { \ case tinyxml2::XML_ERROR_FILE_NOT_FOUND : \ case tinyxml2::XML_ERROR_FILE_COULD_NOT_BE_OPENED : \ - throw exception::InvalidInputFile("TinyXML error: File could not be opened.\n Error code: %d", a_eResult); \ + THROW exception::InvalidInputFile("TinyXML error: File could not be opened.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_FILE_READ_ERROR : \ - throw exception::InvalidInputFile("TinyXML error: File could not be read.\n Error code: %d", a_eResult); \ + THROW exception::InvalidInputFile("TinyXML error: File could not be read.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_PARSING_ELEMENT : \ case tinyxml2::XML_ERROR_PARSING_ATTRIBUTE : \ case tinyxml2::XML_ERROR_PARSING_TEXT : \ @@ -33,25 +33,25 @@ namespace io { case tinyxml2::XML_ERROR_PARSING_DECLARATION : \ case tinyxml2::XML_ERROR_PARSING_UNKNOWN : \ case tinyxml2::XML_ERROR_PARSING : \ - throw exception::TinyXMLError("TinyXML error: Error parsing file.\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: Error parsing file.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_EMPTY_DOCUMENT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_EMPTY_DOCUMENT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_EMPTY_DOCUMENT\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_MISMATCHED_ELEMENT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_MISMATCHED_ELEMENT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_MISMATCHED_ELEMENT\n Error code: %d", a_eResult); \ case tinyxml2::XML_CAN_NOT_CONVERT_TEXT : \ - throw exception::TinyXMLError("TinyXML error: XML_CAN_NOT_CONVERT_TEXT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_CAN_NOT_CONVERT_TEXT\n Error code: %d", a_eResult); \ case tinyxml2::XML_NO_TEXT_NODE : \ - throw exception::TinyXMLError("TinyXML error: XML_NO_TEXT_NODE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_NO_TEXT_NODE\n Error code: %d", a_eResult); \ case tinyxml2::XML_ELEMENT_DEPTH_EXCEEDED : \ - throw exception::TinyXMLError("TinyXML error: XML_ELEMENT_DEPTH_EXCEEDED\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ELEMENT_DEPTH_EXCEEDED\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_COUNT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_COUNT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_COUNT\n Error code: %d", a_eResult); \ case tinyxml2::XML_NO_ATTRIBUTE: \ - throw exception::TinyXMLError("TinyXML error: XML_NO_ATTRIBUTE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_NO_ATTRIBUTE\n Error code: %d", a_eResult); \ case tinyxml2::XML_WRONG_ATTRIBUTE_TYPE : \ - throw exception::TinyXMLError("TinyXML error: XML_WRONG_ATTRIBUTE_TYPE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_WRONG_ATTRIBUTE_TYPE\n Error code: %d", a_eResult); \ default: \ - throw exception::TinyXMLError("TinyXML error: Unrecognised error code\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: Unrecognised error code\n Error code: %d", 
a_eResult); \ } \ } #endif diff --git a/src/flamegpu/io/XMLStateWriter.cu b/src/flamegpu/io/XMLStateWriter.cu index a0b3afdd9..22e7959e3 100644 --- a/src/flamegpu/io/XMLStateWriter.cu +++ b/src/flamegpu/io/XMLStateWriter.cu @@ -21,9 +21,9 @@ namespace io { switch (a_eResult) { \ case tinyxml2::XML_ERROR_FILE_NOT_FOUND : \ case tinyxml2::XML_ERROR_FILE_COULD_NOT_BE_OPENED : \ - throw exception::InvalidInputFile("TinyXML error: File could not be opened.\n Error code: %d", a_eResult); \ + THROW exception::InvalidInputFile("TinyXML error: File could not be opened.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_FILE_READ_ERROR : \ - throw exception::InvalidInputFile("TinyXML error: File could not be read.\n Error code: %d", a_eResult); \ + THROW exception::InvalidInputFile("TinyXML error: File could not be read.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_PARSING_ELEMENT : \ case tinyxml2::XML_ERROR_PARSING_ATTRIBUTE : \ case tinyxml2::XML_ERROR_PARSING_TEXT : \ @@ -32,25 +32,25 @@ namespace io { case tinyxml2::XML_ERROR_PARSING_DECLARATION : \ case tinyxml2::XML_ERROR_PARSING_UNKNOWN : \ case tinyxml2::XML_ERROR_PARSING : \ - throw exception::TinyXMLError("TinyXML error: Error parsing file.\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: Error parsing file.\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_EMPTY_DOCUMENT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_EMPTY_DOCUMENT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_EMPTY_DOCUMENT\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_MISMATCHED_ELEMENT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_MISMATCHED_ELEMENT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_MISMATCHED_ELEMENT\n Error code: %d", a_eResult); \ case tinyxml2::XML_CAN_NOT_CONVERT_TEXT : \ - throw exception::TinyXMLError("TinyXML error: XML_CAN_NOT_CONVERT_TEXT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_CAN_NOT_CONVERT_TEXT\n Error code: %d", a_eResult); \ case tinyxml2::XML_NO_TEXT_NODE : \ - throw exception::TinyXMLError("TinyXML error: XML_NO_TEXT_NODE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_NO_TEXT_NODE\n Error code: %d", a_eResult); \ case tinyxml2::XML_ELEMENT_DEPTH_EXCEEDED : \ - throw exception::TinyXMLError("TinyXML error: XML_ELEMENT_DEPTH_EXCEEDED\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ELEMENT_DEPTH_EXCEEDED\n Error code: %d", a_eResult); \ case tinyxml2::XML_ERROR_COUNT : \ - throw exception::TinyXMLError("TinyXML error: XML_ERROR_COUNT\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_ERROR_COUNT\n Error code: %d", a_eResult); \ case tinyxml2::XML_NO_ATTRIBUTE: \ - throw exception::TinyXMLError("TinyXML error: XML_NO_ATTRIBUTE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_NO_ATTRIBUTE\n Error code: %d", a_eResult); \ case tinyxml2::XML_WRONG_ATTRIBUTE_TYPE : \ - throw exception::TinyXMLError("TinyXML error: XML_WRONG_ATTRIBUTE_TYPE\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: XML_WRONG_ATTRIBUTE_TYPE\n Error code: %d", a_eResult); \ default: \ - throw exception::TinyXMLError("TinyXML error: Unrecognised error code\n Error code: %d", a_eResult); \ + THROW exception::TinyXMLError("TinyXML error: Unrecognised error code\n Error 
code: %d", a_eResult); \ } \ } #endif diff --git a/src/flamegpu/simulation/CUDAEnsemble.cu b/src/flamegpu/simulation/CUDAEnsemble.cu index 1a6d456b0..ae6074a80 100644 --- a/src/flamegpu/simulation/CUDAEnsemble.cu +++ b/src/flamegpu/simulation/CUDAEnsemble.cu @@ -11,6 +11,11 @@ #include #include +#ifdef FLAMEGPU_ENABLE_MPI +#include "flamegpu/simulation/detail/MPIEnsemble.h" +#include "flamegpu/simulation/detail/MPISimRunner.h" +#endif + #include "flamegpu/version.h" #include "flamegpu/model/ModelDescription.h" #include "flamegpu/simulation/RunPlanVector.h" @@ -26,7 +31,6 @@ #include "flamegpu/io/Telemetry.h" namespace flamegpu { - CUDAEnsemble::EnsembleConfig::EnsembleConfig() : telemetry(flamegpu::io::Telemetry::isEnabled()) {} @@ -46,9 +50,7 @@ CUDAEnsemble::~CUDAEnsemble() { #endif } - - -unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { +unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) { #ifdef _MSC_VER if (config.block_standby) { // This thread requires the system continuously until it exits @@ -59,8 +61,17 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { if (*plans.environment != this->model->environment->properties) { THROW exception::InvalidArgument("RunPlan is for a different ModelDescription, in CUDAEnsemble::simulate()"); } + +#ifdef FLAMEGPU_ENABLE_MPI + std::unique_ptr mpi = std::make_unique(config, static_cast(plans.size())); +#endif + // Validate/init output directories - if (!config.out_directory.empty()) { + if (!config.out_directory.empty() +#ifdef FLAMEGPU_ENABLE_MPI + && (!config.mpi || mpi->world_rank == 0) +#endif + ) { // Validate out format is right config.out_format = io::StateWriterFactory::detectSupportedFileExt(config.out_format); if (config.out_format.empty()) { @@ -84,13 +95,13 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { step_path /= std::filesystem::path(plans[p].getOutputSubdirectory()); step_path /= std::filesystem::path(std::to_string(p) + "." + config.out_format); if (std::filesystem::exists(step_path)) { - THROW exception::FileAlreadyExists("Step log file '%s' already exists, in CUDAEnsemble::simulate()", step_path.c_str()); + THROW exception::FileAlreadyExists("Step log file '%s' already exists, in CUDAEnsemble::simulate()", step_path.generic_string().c_str()); } } // Exit for (const auto &exit_path : exit_files) { if (std::filesystem::exists(exit_path)) { - THROW exception::FileAlreadyExists("Exit log file '%s' already exists, in CUDAEnsemble::simulate()", exit_path.c_str()); + THROW exception::FileAlreadyExists("Exit log file '%s' already exists, in CUDAEnsemble::simulate()", exit_path.generic_string().c_str()); } } } else { @@ -122,8 +133,8 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { // Purge run logs, and resize ready for new runs // Resize means we can setup logs during execution out of order, without risk of list being reallocated run_logs.clear(); - run_logs.resize(plans.size()); // Workout how many devices and runner we will be executing + // if MPI is enabled, This will throw exceptions if any rank has 0 GPUs visible, prior to device allocation preventing issues where rank 0 would not be participating. int device_count = -1; cudaError_t cudaStatus = cudaGetDeviceCount(&device_count); if (cudaStatus != cudaSuccess) { @@ -138,14 +149,24 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { } } + // Select the actual devices to be used, based on user provided gpus, architecture compatibility, and optionally mpi ranks per node. 
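The load balancing these comments describe can be summarised as: every rank sharing a node receives device_count / local_size GPUs, and the first device_count % local_size ranks absorb one extra each. A minimal standalone sketch of that split (hypothetical helper name, not part of the FLAME GPU API):

```cpp
#include <algorithm>
#include <set>
#include <vector>

// Illustrative only: divide the visible GPUs between the MPI ranks that share a node.
// Lower ranks absorb the remainder, matching the behaviour described above.
std::set<int> balanceDevices(const std::set<int> &visible, int local_rank, int local_size) {
    std::vector<int> v(visible.begin(), visible.end());
    const int count = static_cast<int>(v.size());
    if (local_size <= 1 || count == 0)
        return visible;                                    // single rank (or nothing to share)
    if (local_size > count)                                // more ranks than GPUs: at most one each
        return local_rank < count ? std::set<int>{v[local_rank]} : std::set<int>{};
    const int per_rank = count / local_size;               // guaranteed >= 1 here
    const int remainder = count % local_size;              // first 'remainder' ranks get +1
    const int first = local_rank * per_rank + std::min(local_rank, remainder);
    const int n = per_rank + (local_rank < remainder ? 1 : 0);
    return std::set<int>(v.begin() + first, v.begin() + first + n);
}
```

For example, with 5 visible GPUs and 2 ranks on a node, rank 0 would take devices {0, 1, 2} and rank 1 would take {3, 4}.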
+ // For non-mpi builds / configurations, just use all the devices provided by the user / all visible devices (then check they are valid later) + // For MPI builds with mpi enabled, load balance the gpus across mpi ranks within the shared memory system. If there are more ranks than gpus, latter ranks will not participate. std::set devices; + // initialise the local devices set to be the non-mpi behaviour, using config.devices or all visible cuda devices if (config.devices.size()) { devices = config.devices; } else { + // If no devices were specified by the user, use all visible devices but load balance if MPI is in use. for (int i = 0; i < device_count; ++i) { devices.emplace(i); } } +#ifdef FLAMEGPU_ENABLE_MPI + // if MPI is enabled at compile time, use the MPIEnsemble method to assign devices balanced across ranks + devices = mpi->devicesForThisRank(devices); +#endif // ifdef FLAMEGPU_ENABLE_MPI + // Check that each device is capable, and init cuda context for (auto d = devices.begin(); d != devices.end(); ++d) { if (!detail::compute_capability::checkComputeCapability(*d)) { @@ -160,11 +181,31 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { // Return to device 0 (or check original device first?) gpuErrchk(cudaSetDevice(0)); - // Init runners, devices * concurrent runs - std::atomic err_ct = {0}; - std::atomic next_run = {0}; + // If there are no devices left (and mpi is not being used), we need to error as the work cannot be executed. +#ifndef FLAMEGPU_ENABLE_MPI + if (devices.size() == 0) { + THROW exception::InvalidCUDAdevice("FLAMEGPU2 has not been built with an appropriate compute capability for any devices, unable to continue\n"); + } +#endif // ifndef FLAMEGPU_ENABLE_MPI + +#ifdef FLAMEGPU_ENABLE_MPI + // Once the number of devices per rank is known, we can create the actual communicator to be used during MPI, so we can warn/error as needed. + // This rank is participating if it has at least one device assigned to it. + // Rank 0 will be participating at this point, otherwise InvalidCUDAdevice would have been thrown + // This also implies the participating communicator cannot have a size of 0, as at least one thread must be participating at this point, but throw in that case just in case. + bool communicatorCreated = mpi->createParticipatingCommunicator(devices.size() > 0); + // If the communicator failed to be created or is empty for any participating threads, throw. This should never occur. + if (!communicatorCreated || mpi->getParticipatingCommSize() == 0) { + THROW exception::EnsembleError("Unable to create MPI communicator. Ensure at least one GPU is visible.\n"); + } + // If the world size is not the participating size, issue a warning that too many threads have been used. + if (mpi->world_rank == 0 && mpi->world_size != mpi->getParticipatingCommSize() && config.verbosity >= Verbosity::Default) { + fprintf(stderr, "Warning: MPI Ensemble launched with %d MPI ranks, but only %d ranks have GPUs assigned. 
%d ranks are unneccesary.\n", mpi->world_size, mpi->getParticipatingCommSize(), mpi->world_size - mpi->getParticipatingCommSize()); + fflush(stderr); + } +#endif + const unsigned int TOTAL_RUNNERS = static_cast(devices.size()) * config.concurrent_runs; - detail::SimRunner *runners = static_cast(malloc(sizeof(detail::SimRunner) * TOTAL_RUNNERS)); // Log Time (We can't use CUDA events here, due to device resets) auto ensemble_timer = detail::SteadyClockTimer(); @@ -176,21 +217,11 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { std::queue log_export_queue; std::mutex log_export_queue_mutex; std::condition_variable log_export_queue_cdn; - detail::SimRunner::ErrorDetail fast_err_detail = {}; - - // Init with placement new - { - unsigned int i = 0; - for (auto &d : devices) { - for (unsigned int j = 0; j < config.concurrent_runs; ++j) { - new (&runners[i++]) detail::SimRunner(model, err_ct, next_run, plans, - step_log_config, exit_log_config, - d, j, - config.verbosity, config.error_level == EnsembleConfig::Fast, - run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, fast_err_detail, TOTAL_RUNNERS, isSWIG); - } - } - } +#ifdef FLAMEGPU_ENABLE_MPI + // In MPI mode, Rank 0 will collect errors from all ranks + std::multimap err_detail = {}; +#endif + std::vector err_detail_local = {}; // Init log worker detail::SimLogger *log_worker = nullptr; @@ -199,11 +230,152 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { step_log_config.get(), exit_log_config.get(), step_log_config && step_log_config->log_timing, exit_log_config && exit_log_config->log_timing); } - // Wait for all runners to exit - for (unsigned int i = 0; i < TOTAL_RUNNERS; ++i) { - runners[i].thread.join(); - runners[i].~SimRunner(); + // In MPI mode, only Rank 0 increments the error counter + unsigned int err_count = 0; + if (config.mpi) { +#ifdef FLAMEGPU_ENABLE_MPI + // Setup MPISimRunners + detail::MPISimRunner** runners = static_cast(malloc(sizeof(detail::MPISimRunner*) * TOTAL_RUNNERS)); + std::vector> err_cts(TOTAL_RUNNERS); + std::vector> next_runs(TOTAL_RUNNERS); + for (unsigned int i = 0; i < TOTAL_RUNNERS; ++i) { + err_cts[i] = UINT_MAX; + next_runs[i] = detail::MPISimRunner::Signal::RequestJob; + } + { + unsigned int i = 0; + for (auto& d : devices) { + for (unsigned int j = 0; j < config.concurrent_runs; ++j) { + runners[i] = new detail::MPISimRunner(model, err_cts[i], next_runs[i], plans, + step_log_config, exit_log_config, + d, j, + config.verbosity, + run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, err_detail_local, TOTAL_RUNNERS, isSWIG); + runners[i]->start(); + ++i; + } + } + } + // Wait for runners to request work, then communicate via MPI to get assignments + // If work_rank == 0, also perform the assignments + if (mpi->world_rank == 0) { + unsigned int next_run = 0; + MPI_Status status; + int flag; + int mpi_runners_fin = 1; // Start at 1 because we have always already finished + // Wait for all runs to have been assigned, and all MPI runners to have been notified of fin + while (next_run < plans.size() || mpi_runners_fin < mpi->getParticipatingCommSize()) { + // Check for errors + const int t_err_count = mpi->receiveErrors(err_detail); + err_count += t_err_count; + if (t_err_count && config.error_level == EnsembleConfig::Fast) { + // Skip to end to kill workers + next_run = plans.size(); + } + // Check whether local runners require a job assignment + for (unsigned int i = 0; i < next_runs.size(); ++i) { + auto &r = next_runs[i]; + 
unsigned int run_id = r.load(); + if (run_id == detail::MPISimRunner::Signal::RunFailed) { + // Retrieve and handle local error detail + mpi->retrieveLocalErrorDetail(log_export_queue_mutex, err_detail, err_detail_local, i, devices); + ++err_count; + if (config.error_level == EnsembleConfig::Fast) { + // Skip to end to kill workers + next_run = plans.size(); + } + run_id = detail::MPISimRunner::Signal::RequestJob; + } + if (run_id == detail::MPISimRunner::Signal::RequestJob) { + r.store(next_run++); + // Print progress to console + if (config.verbosity >= Verbosity::Default && next_run <= plans.size()) { + fprintf(stdout, "MPI ensemble assigned run %d/%u to rank 0\n", next_run, static_cast(plans.size())); + fflush(stdout); + } + } + } + // Check whether MPI runners require a job assignment + mpi_runners_fin += mpi->receiveJobRequests(next_run); + // Yield, rather than hammering the processor + std::this_thread::yield(); + } + } else if (mpi->getRankIsParticipating()) { + // Wait for all runs to have been assigned, and all MPI runners to have been notified of fin. ranks without GPU(s) do not request jobs. + unsigned int next_run = 0; + MPI_Status status; + while (next_run < plans.size()) { + // Check whether local runners require a job assignment + for (unsigned int i = 0; i < TOTAL_RUNNERS; ++i) { + unsigned int runner_status = next_runs[i].load(); + if (runner_status == detail::MPISimRunner::Signal::RunFailed) { + // Fetch the job id, increment local error counter + const unsigned int failed_run_id = err_cts[i].exchange(UINT_MAX); + ++err_count; + // Retrieve and handle local error detail + mpi->retrieveLocalErrorDetail(log_export_queue_mutex, err_detail, err_detail_local, i, devices); + runner_status = detail::MPISimRunner::Signal::RequestJob; + } + if (runner_status == detail::MPISimRunner::Signal::RequestJob) { + next_run = mpi->requestJob(); + // Pass the job to runner that requested it + next_runs[i].store(next_run); + // Break if assigned job is out of range, work is finished + if (next_run >= plans.size()) { + break; + } + } + } + std::this_thread::yield(); + } + } + + // Notify all local runners to exit + for (unsigned int i = 0; i < TOTAL_RUNNERS; ++i) { + auto &r = next_runs[i]; + if (r.exchange(plans.size()) == detail::MPISimRunner::Signal::RunFailed) { + ++err_count; + // Retrieve and handle local error detail + mpi->retrieveLocalErrorDetail(log_export_queue_mutex, err_detail, err_detail_local, i, devices); + } + } + // Wait for all runners to exit + for (unsigned int i = 0; i < TOTAL_RUNNERS; ++i) { + runners[i]->join(); + delete runners[i]; + if (next_runs[i].load() == detail::MPISimRunner::Signal::RunFailed) { + ++err_count; + // Retrieve and handle local error detail + mpi->retrieveLocalErrorDetail(log_export_queue_mutex, err_detail, err_detail_local, i, devices); + } + } +#endif + } else { + detail::SimRunner** runners = static_cast(malloc(sizeof(detail::SimRunner*) * TOTAL_RUNNERS)); + std::atomic err_ct = { 0u }; + std::atomic next_runs = { 0u }; + // Setup SimRunners + { + unsigned int i = 0; + for (auto& d : devices) { + for (unsigned int j = 0; j < config.concurrent_runs; ++j) { + runners[i] = new detail::SimRunner(model, err_ct, next_runs, plans, + step_log_config, exit_log_config, + d, j, + config.verbosity, config.error_level == EnsembleConfig::Fast, + run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, err_detail_local, TOTAL_RUNNERS, isSWIG); + runners[i++]->start(); + } + } + } + // Wait for all runners to exit + for (unsigned int i = 0; i 
< TOTAL_RUNNERS; ++i) { + runners[i]->join(); + delete runners[i]; + } + err_count = err_ct; } + // Notify logger to exit if (log_worker) { { @@ -216,32 +388,66 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { log_worker = nullptr; } +#ifdef FLAMEGPU_ENABLE_MPI + std::string remote_device_names; + if (config.mpi) { + // Ensure all workers have finished before exit + mpi->worldBarrier(); + // Check whether MPI runners have reported any final errors + err_count += mpi->receiveErrors(err_detail); + if (config.telemetry) { + // All ranks should notify rank 0 of their GPU devices + remote_device_names = mpi->assembleGPUsString(); + } + } +#endif // Record and store the elapsed time ensemble_timer.stop(); ensemble_elapsed_time = ensemble_timer.getElapsedSeconds(); // Ensemble has finished, print summary - if (config.verbosity > Verbosity::Quiet) { - printf("\rCUDAEnsemble completed %u runs successfully!\n", static_cast(plans.size() - err_ct)); - if (err_ct) - printf("There were a total of %u errors.\n", err_ct.load()); + if (config.verbosity > Verbosity::Quiet && +#ifdef FLAMEGPU_ENABLE_MPI + (!config.mpi || mpi->world_rank == 0) && +#endif + (config.error_level != EnsembleConfig::Fast || err_count == 0)) { + printf("\rCUDAEnsemble completed %u runs successfully!\n", static_cast(plans.size() - err_count)); + if (err_count) + printf("There were a total of %u errors.\n", err_count); } - if (config.timing || config.verbosity >= Verbosity::Verbose) { + if ((config.timing || config.verbosity >= Verbosity::Verbose) && +#ifdef FLAMEGPU_ENABLE_MPI + (!config.mpi || mpi->world_rank == 0) && +#endif + (config.error_level != EnsembleConfig::Fast || err_count == 0)) { printf("Ensemble time elapsed: %fs\n", ensemble_elapsed_time); } // Send Telemetry - if (config.telemetry) { + if (config.telemetry +#ifdef FLAMEGPU_ENABLE_MPI + && (!config.mpi || mpi->world_rank == 0) +#endif + ) { // Generate some payload items std::map payload_items; +#ifndef FLAMEGPU_ENABLE_MPI payload_items["GPUDevices"] = flamegpu::detail::compute_capability::getDeviceNames(config.devices); +#else + payload_items["GPUDevices"] = flamegpu::detail::compute_capability::getDeviceNames(config.devices) + remote_device_names; +#endif payload_items["SimTime(s)"] = std::to_string(ensemble_elapsed_time); - #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_BUILD__) - payload_items["NVCCVersion"] = std::to_string(__CUDACC_VER_MAJOR__) + "." + std::to_string(__CUDACC_VER_MINOR__) + "." + std::to_string(__CUDACC_VER_BUILD__); - #endif +#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_BUILD__) + payload_items["NVCCVersion"] = std::to_string(__CUDACC_VER_MAJOR__) + "." + std::to_string(__CUDACC_VER_MINOR__) + "." + std::to_string(__CUDACC_VER_BUILD__); +#endif // Add the ensemble size to the ensemble telemetry payload payload_items["PlansSize"] = std::to_string(plans.size()); payload_items["ConcurrentRuns"] = std::to_string(config.concurrent_runs); + // Add MPI details to the ensemble telemetry payload + payload_items["mpi"] = config.mpi ? 
"true" : "false"; +#ifdef FLAMEGPU_ENABLE_MPI + payload_items["mpi_world_size"] = std::to_string(mpi->world_size); +#endif // generate telemetry data std::string telemetry_data = flamegpu::io::Telemetry::generateData("ensemble-run", payload_items, isSWIG); // send the telemetry packet @@ -256,19 +462,35 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { } } else { // Encourage users who have opted out to opt back in, unless suppressed. - if ((config.verbosity > Verbosity::Quiet)) { + if ((config.verbosity > Verbosity::Quiet) +#ifdef FLAMEGPU_ENABLE_MPI + && (!config.mpi || mpi->world_rank == 0) +#endif + ) { flamegpu::io::Telemetry::encourageUsage(); } } - // Free memory - free(runners); +#ifdef FLAMEGPU_ENABLE_MPI + if (config.mpi && mpi->world_rank != 0) { + // All errors are reported via rank 0 + err_count = 0; + } +#endif - if (config.error_level == EnsembleConfig::Fast && err_ct.load()) { + if (config.error_level == EnsembleConfig::Fast && err_count) { + if (config.mpi) { +#ifdef FLAMEGPU_ENABLE_MPI + for (const auto &e : err_detail) { + THROW exception::EnsembleError("Run %u failed on rank %d, device %d, thread %u with exception: \n%s\n", + e.second.run_id, e.first, e.second.device_id, e.second.runner_id, e.second.exception_string); + } +#endif + } THROW exception::EnsembleError("Run %u failed on device %d, thread %u with exception: \n%s\n", - fast_err_detail.run_id, fast_err_detail.device_id, fast_err_detail.runner_id, fast_err_detail.exception_string.c_str()); - } else if (config.error_level == EnsembleConfig::Slow && err_ct.load()) { - THROW exception::EnsembleError("%u/%u runs failed!\n.", err_ct.load(), static_cast(plans.size())); + err_detail_local[0].run_id, err_detail_local[0].device_id, err_detail_local[0].runner_id, err_detail_local[0].exception_string); + } else if (config.error_level == EnsembleConfig::Slow && err_count) { + THROW exception::EnsembleError("%u/%u runs failed!\n.", err_count, static_cast(plans.size())); } #ifdef _MSC_VER if (config.block_standby) { @@ -277,7 +499,7 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) { } #endif - return err_ct.load(); + return err_count; } void CUDAEnsemble::initialise(int argc, const char** argv) { @@ -420,7 +642,7 @@ int CUDAEnsemble::checkArgs(int argc, const char** argv) { #endif continue; } - // Warning if not in QUIET verbosity or if silnce-unknown-args is set + // Warning if not in QUIET verbosity or if silence-unknown-args is set if (!(config.verbosity == flamegpu::Verbosity::Quiet || config.silence_unknown_args)) fprintf(stderr, "Warning: Unknown argument '%s' passed to Ensemble will be ignored\n", arg.c_str()); } @@ -441,11 +663,11 @@ void CUDAEnsemble::printHelp(const char *executable) { printf(line_fmt, "-v, --verbose", "Print config, progress and timing (-t) information to console"); printf(line_fmt, "-t, --timing", "Output timing information to stdout"); printf(line_fmt, "-e, --error ", "The error level 0, 1, 2, off, slow or fast"); + printf(line_fmt, "", "By default, \"slow\" will be used."); printf(line_fmt, "-u, --silence-unknown-args", "Silence warnings for unknown arguments passed after this flag."); #ifdef _MSC_VER printf(line_fmt, " --standby", "Allow the machine to enter standby during execution"); #endif - printf(line_fmt, "", "By default, \"slow\" will be used."); } void CUDAEnsemble::setStepLog(const StepLoggingConfig &stepConfig) { // Validate ModelDescription matches @@ -463,8 +685,7 @@ void CUDAEnsemble::setExitLog(const LoggingConfig &exitConfig) { // Set internal 
config exit_log_config = std::make_shared(exitConfig); } -const std::vector &CUDAEnsemble::getLogs() { +const std::map &CUDAEnsemble::getLogs() { return run_logs; } - } // namespace flamegpu diff --git a/src/flamegpu/simulation/RunPlan.cpp b/src/flamegpu/simulation/RunPlan.cpp index c0af8ef17..38c36354d 100644 --- a/src/flamegpu/simulation/RunPlan.cpp +++ b/src/flamegpu/simulation/RunPlan.cpp @@ -30,7 +30,7 @@ void RunPlan::setRandomSimulationSeed(const uint64_t _random_seed) { } void RunPlan::setSteps(const unsigned int _steps) { if (_steps == 0 && !allow_0_steps) { - throw exception::OutOfBoundsException("Model description requires atleast 1 exit condition to have unlimited steps, " + THROW exception::OutOfBoundsException("Model description requires atleast 1 exit condition to have unlimited steps, " "in RunPlan::setSteps()"); } steps = _steps; diff --git a/src/flamegpu/simulation/RunPlanVector.cpp b/src/flamegpu/simulation/RunPlanVector.cpp index 522a90885..20e2b5d5f 100644 --- a/src/flamegpu/simulation/RunPlanVector.cpp +++ b/src/flamegpu/simulation/RunPlanVector.cpp @@ -27,7 +27,7 @@ void RunPlanVector::setRandomSimulationSeed(const uint64_t initial_seed, const u } void RunPlanVector::setSteps(const unsigned int steps) { if (steps == 0 && !allow_0_steps) { - throw exception::OutOfBoundsException("Model description requires atleast 1 exit condition to have unlimited steps, " + THROW exception::OutOfBoundsException("Model description requires atleast 1 exit condition to have unlimited steps, " "in RunPlanVector::setSteps()"); } for (auto &i : *this) { diff --git a/src/flamegpu/simulation/detail/AbstractSimRunner.cu b/src/flamegpu/simulation/detail/AbstractSimRunner.cu new file mode 100644 index 000000000..a859a823d --- /dev/null +++ b/src/flamegpu/simulation/detail/AbstractSimRunner.cu @@ -0,0 +1,114 @@ +#include "flamegpu/simulation/detail/AbstractSimRunner.h" + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include "flamegpu/model/ModelData.h" +#include "flamegpu/simulation/CUDASimulation.h" +#include "flamegpu/simulation/RunPlanVector.h" + +namespace flamegpu { +namespace detail { + +AbstractSimRunner::AbstractSimRunner(const std::shared_ptr _model, + std::atomic &_err_ct, + std::atomic &_next_run, + const RunPlanVector &_plans, + std::shared_ptr _step_log_config, + std::shared_ptr _exit_log_config, + int _device_id, + unsigned int _runner_id, + flamegpu::Verbosity _verbosity, + std::map &_run_logs, + std::queue &_log_export_queue, + std::mutex &_log_export_queue_mutex, + std::condition_variable &_log_export_queue_cdn, + std::vector &_err_detail, + const unsigned int _total_runners, + bool _isSWIG) + : model(_model->clone()) + , device_id(_device_id) + , runner_id(_runner_id) + , total_runners(_total_runners) + , verbosity(_verbosity) + , err_ct(_err_ct) + , next_run(_next_run) + , plans(_plans) + , step_log_config(std::move(_step_log_config)) + , exit_log_config(std::move(_exit_log_config)) + , run_logs(_run_logs) + , log_export_queue(_log_export_queue) + , log_export_queue_mutex(_log_export_queue_mutex) + , log_export_queue_cdn(_log_export_queue_cdn) + , err_detail(_err_detail) + , isSWIG(_isSWIG) { +} +void AbstractSimRunner::start() { + this->thread = std::thread(&AbstractSimRunner::main, this); + // Attempt to name the thread +#ifdef _MSC_VER + std::wstringstream thread_name; + thread_name << L"CUDASim D" << device_id << L"T" << runner_id; + // HRESULT hr = + SetThreadDescription(this->thread.native_handle(), thread_name.str().c_str()); + // if (FAILED(hr)) { + // 
fprintf(stderr, "Failed to name thread 'CUDASim D%dT%u'\n", device_id, runner_id); + // } +#else + std::stringstream thread_name; + thread_name << "CUDASim D" << device_id << "T" << runner_id; + // int hr = + pthread_setname_np(this->thread.native_handle(), thread_name.str().c_str()); + // if (hr) { + // fprintf(stderr, "Failed to name thread 'CUDASim D%dT%u'\n", device_id, runner_id); + // } +#endif +} +void AbstractSimRunner::join() { + if (this->thread.joinable()) { + this->thread.join(); + } +} + +void AbstractSimRunner::runSimulation(int plan_id) { + // Update environment (this might be worth moving into CUDASimulation) + auto &prop_map = model->environment->properties; + for (auto &ovrd : plans[plan_id].property_overrides) { + auto &prop = prop_map.at(ovrd.first); + memcpy(prop.data.ptr, ovrd.second.ptr, prop.data.length); + } + // Set simulation device + std::unique_ptr simulation = std::unique_ptr(new CUDASimulation(model, isSWIG)); + // Copy steps and seed from runplan + simulation->SimulationConfig().steps = plans[plan_id].getSteps(); + simulation->SimulationConfig().random_seed = plans[plan_id].getRandomSimulationSeed(); + simulation->SimulationConfig().verbosity = Verbosity::Default; + if (verbosity == Verbosity::Quiet) // Use quiet verbosity for sims if set in ensemble but never verbose + simulation->SimulationConfig().verbosity = Verbosity::Quiet; + simulation->SimulationConfig().telemetry = false; // Never any telemtry for individual runs inside an ensemble + simulation->SimulationConfig().timing = false; + simulation->CUDAConfig().device_id = this->device_id; + simulation->CUDAConfig().is_ensemble = true; + simulation->CUDAConfig().ensemble_run_id = plan_id; + simulation->applyConfig(); + // Set the step config directly, to bypass validation + simulation->step_log_config = step_log_config; + simulation->exit_log_config = exit_log_config; + // Don't need to set pop, this must be done via init function within ensembles + // Execute simulation + simulation->simulate(); + { + std::lock_guard lck(log_export_queue_mutex); + // Store results in run_log + run_logs.emplace(plan_id, simulation->getRunLog()); + // Notify logger + log_export_queue.push(plan_id); + } + log_export_queue_cdn.notify_one(); +} + +} // namespace detail +} // namespace flamegpu diff --git a/src/flamegpu/simulation/detail/MPIEnsemble.cu b/src/flamegpu/simulation/detail/MPIEnsemble.cu new file mode 100644 index 000000000..941100e60 --- /dev/null +++ b/src/flamegpu/simulation/detail/MPIEnsemble.cu @@ -0,0 +1,348 @@ +#ifdef FLAMEGPU_ENABLE_MPI +#include "flamegpu/simulation/detail/MPIEnsemble.h" + +#include "flamegpu/detail/compute_capability.cuh" + +namespace flamegpu { +namespace detail { + +MPIEnsemble::MPIEnsemble(const CUDAEnsemble::EnsembleConfig &_config, const unsigned int _total_runs) + : config(_config) + , world_rank(queryMPIWorldRank()) + , world_size(queryMPIWorldSize()) + , local_rank(queryMPISharedGroupRank()) + , local_size(queryMPISharedGroupSize()) + , total_runs(_total_runs) + , MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype()) + , rank_is_participating(false) + , comm_participating(MPI_COMM_NULL) + , participating_size(0) + , participating_rank(-1) { } + +int MPIEnsemble::receiveErrors(std::multimap &err_detail) { + int errCount = 0; + if (world_rank == 0) { + MPI_Status status; + int flag; + // Check whether MPI runners have reported an error + MPI_Iprobe( + MPI_ANY_SOURCE, // int source + EnvelopeTag::ReportError, // int tag + MPI_COMM_WORLD, // MPI_Comm communicator + 
&flag, // int flag + &status); + while (flag) { + // Receive the message + memset(&status, 0, sizeof(MPI_Status)); + AbstractSimRunner::ErrorDetail e_detail; + memset(&e_detail, 0, sizeof(AbstractSimRunner::ErrorDetail)); + MPI_Recv( + &e_detail, // void* data + 1, // int count + MPI_ERROR_DETAIL, // MPI_Datatype datatype (can't use MPI_DATATYPE_NULL) + MPI_ANY_SOURCE, // int source + EnvelopeTag::ReportError, // int tag + MPI_COMM_WORLD, // MPI_Comm communicator + &status); // MPI_Status* + err_detail.emplace(status.MPI_SOURCE, e_detail); + ++errCount; + // Progress flush + if (config.verbosity >= Verbosity::Default && config.error_level != CUDAEnsemble::EnsembleConfig::Fast) { + fprintf(stderr, "Warning: Run %u/%u failed on rank %d, device %d, thread %u with exception: \n%s\n", + e_detail.run_id + 1, total_runs, status.MPI_SOURCE, e_detail.device_id, e_detail.runner_id, e_detail.exception_string); + fflush(stderr); + } + // Check again + MPI_Iprobe(MPI_ANY_SOURCE, EnvelopeTag::ReportError, MPI_COMM_WORLD, &flag, &status); + } + } + return errCount; +} +int MPIEnsemble::receiveJobRequests(unsigned int &next_run) { + int mpi_runners_fin = 0; + if (world_rank == 0) { + MPI_Status status; + int flag; + MPI_Iprobe( + MPI_ANY_SOURCE, // int source + EnvelopeTag::RequestJob, // int tag + MPI_COMM_WORLD, // MPI_Comm communicator + &flag, // int flag + &status); // MPI_Status* + while (flag) { + // Receive the message (kind of redundant as we already have the status and it carrys no data) + memset(&status, 0, sizeof(MPI_Status)); + MPI_Recv( + nullptr, // void* data + 0, // int count + MPI_CHAR, // MPI_Datatype datatype (can't use MPI_DATATYPE_NULL) + MPI_ANY_SOURCE, // int source + EnvelopeTag::RequestJob, // int tag + MPI_COMM_WORLD, // MPI_Comm communicator + &status); // MPI_Status* + // Respond to the sender with a job assignment + MPI_Send( + &next_run, // void* data + 1, // int count + MPI_UNSIGNED, // MPI_Datatype datatype + status.MPI_SOURCE, // int destination + EnvelopeTag::AssignJob, // int tag + MPI_COMM_WORLD); // MPI_Comm communicator + if (next_run >= total_runs) ++mpi_runners_fin; + ++next_run; + // Print progress to console + if (config.verbosity >= Verbosity::Default && next_run <= total_runs) { + fprintf(stdout, "MPI ensemble assigned run %d/%u to rank %d\n", next_run, total_runs, status.MPI_SOURCE); + fflush(stdout); + } + // Check again + MPI_Iprobe(MPI_ANY_SOURCE, EnvelopeTag::RequestJob, MPI_COMM_WORLD, &flag, &status); + } + } + return mpi_runners_fin; +} +void MPIEnsemble::sendErrorDetail(AbstractSimRunner::ErrorDetail &e_detail) { + if (world_rank != 0) { + MPI_Send( + &e_detail, // void* data + 1, // int count + MPI_ERROR_DETAIL, // MPI_Datatype datatype (can't use MPI_DATATYPE_NULL) + 0, // int destination + EnvelopeTag::ReportError, // int tag + MPI_COMM_WORLD); // MPI_Comm communicator + } +} +int MPIEnsemble::requestJob() { + unsigned int next_run = UINT_MAX; + if (world_rank != 0) { + // Send a job request to 0, these have no data + MPI_Send( + nullptr, // void* data + 0, // int count + MPI_CHAR, // MPI_Datatype datatype (can't use MPI_DATATYPE_NULL) + 0, // int destination + EnvelopeTag::RequestJob, // int tag + MPI_COMM_WORLD); // MPI_Comm communicator + // Wait for a job assignment from 0 + MPI_Status status; + memset(&status, 0, sizeof(MPI_Status)); + MPI_Recv( + &next_run, // void* data + 1, // int count + MPI_UNSIGNED, // MPI_Datatype datatype + 0, // int source + EnvelopeTag::AssignJob, // int tag + MPI_COMM_WORLD, // MPI_Comm communicator + &status); // 
MPI_Status* status + } + return next_run; +} +void MPIEnsemble::worldBarrier() { + MPI_Barrier(MPI_COMM_WORLD); +} +std::string MPIEnsemble::assembleGPUsString() { + std::string remote_device_names; + // One rank per node should notify rank 0 of their GPU devices. other ranks will send an empty message. + if (world_rank == 0) { + int bufflen = 256; // Length of name string in cudaDeviceProp + char *buff = static_cast(malloc(bufflen)); + for (int i = 1; i < world_size; ++i) { + // Receive a message from each rank + MPI_Status status; + memset(&status, 0, sizeof(MPI_Status)); + MPI_Probe( + MPI_ANY_SOURCE, // int source + EnvelopeTag::TelemetryDevices, // int tag + MPI_COMM_WORLD, // MPI_Comm communicator + &status); + int strlen = 0; + // Ensure our receive buffer is long enough + MPI_Get_count(&status, MPI_CHAR, &strlen); + if (strlen > bufflen) { + free(buff); + buff = static_cast(malloc(strlen)); + } + MPI_Recv( + buff, // void* data + strlen, // int count + MPI_CHAR, // MPI_Datatype datatype (can't use MPI_DATATYPE_NULL) + MPI_ANY_SOURCE, // int source + EnvelopeTag::TelemetryDevices, // int tag + MPI_COMM_WORLD, // MPI_Comm communicator + &status); // MPI_Status* + if (strlen > 1) { + remote_device_names.append(", "); + remote_device_names.append(buff); + } + } + free(buff); + } else { + const std::string d_string = local_rank == 0 ? compute_capability::getDeviceNames(config.devices) : ""; + // Send GPU count + MPI_Send( + d_string.c_str(), // void* data + d_string.length() + 1, // int count + MPI_CHAR, // MPI_Datatype datatype + 0, // int destination + EnvelopeTag::TelemetryDevices, // int tag + MPI_COMM_WORLD); // MPI_Comm communicator + } + worldBarrier(); + return remote_device_names; +} + +int MPIEnsemble::queryMPIWorldRank() { + initMPI(); + int world_rank = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + return world_rank; +} +int MPIEnsemble::queryMPIWorldSize() { + initMPI(); + int world_size = -1; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + return world_size; +} + +int MPIEnsemble::queryMPISharedGroupRank() { + initMPI(); + int local_rank = -1; + MPI_Comm group; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group); + MPI_Comm_rank(group, &local_rank); + return local_rank; +} + +int MPIEnsemble::queryMPISharedGroupSize() { + initMPI(); + int local_size = -1; + MPI_Comm group; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group); + MPI_Comm_size(group, &local_size); + return local_size; +} + +void MPIEnsemble::initMPI() { + int flag = 0; + // MPI can only be init once, for certain test cases we do some initial MPI comms for setup + MPI_Initialized(&flag); + if (!flag) { + // Init MPI, fetch rank and size + int thread_provided = 0; + // MPI single means that only the main thread will perform MPI actions + MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &thread_provided); + if (thread_provided != MPI_THREAD_SINGLE) { + THROW exception::UnknownInternalError("MPI unable to provide MPI_THREAD_SINGLE support"); + } + } +} +unsigned int MPIEnsemble::getDeviceIndex(const int j, const std::set devices) { + int i = 0; + for (auto& d : devices) { + if (i++ == j) + return d; + } + return j; // If set is empty, then direct index should be used +} +void MPIEnsemble::retrieveLocalErrorDetail(std::mutex &log_export_queue_mutex, std::multimap &err_detail, +std::vector &err_detail_local, const int i, std::set devices) { + // Fetch error detail + detail::AbstractSimRunner::ErrorDetail e_detail; + { + // log_export_mutex is 
treated as our protection for race conditions on err_detail + std::lock_guard lck(log_export_queue_mutex); + // Fetch corresponding error detail + bool success = false; + const unsigned int t_device_id = getDeviceIndex(i / config.concurrent_runs, devices); + const unsigned int t_runner_id = i % config.concurrent_runs; + for (auto it = err_detail_local.begin(); it != err_detail_local.end(); ++it) { + if (it->runner_id == t_runner_id && it->device_id == t_device_id) { + e_detail = *it; + if (world_rank == 0) { + // Only rank 0 collects error details + err_detail.emplace(world_rank, e_detail); + } else { + // fprintf(stderr, "[%d] Purged error from device %u runner %u\n", world_rank, t_device_id, t_runner_id); // useful debug, breaks tests + } + err_detail_local.erase(it); + success = true; + break; + } + } + if (!success) { + THROW exception::UnknownInternalError("[%d] Management thread failed to locate reported error from device %u runner %u from %u errors, in CUDAEnsemble::simulate()", world_rank, t_device_id, t_runner_id, static_cast(err_detail_local.size())); + } + } + if (world_rank == 0) { + // Progress flush + if (config.verbosity >= Verbosity::Default && config.error_level != CUDAEnsemble::EnsembleConfig::Fast) { + fprintf(stderr, "Warning: Run %u/%u failed on rank %d, device %d, thread %u with exception: \n%s\n", + e_detail.run_id + 1, total_runs, world_rank, e_detail.device_id, e_detail.runner_id, e_detail.exception_string); + fflush(stderr); + } + } else { + // Notify 0 that an error occurred, with the error detail + sendErrorDetail(e_detail); + } +} + +bool MPIEnsemble::createParticipatingCommunicator(const bool isParticipating) { + // If the communicator has not yet been created, create it and get the rank and size. + if (this->comm_participating == MPI_COMM_NULL) { + // determine if this thread is participating or not, i..e. the colour of the rank + this->rank_is_participating = isParticipating; + // Split the world communicator, if the split fails, abort (this makes the return type not useful tbh.) + if (MPI_Comm_split(MPI_COMM_WORLD, this->rank_is_participating, this->world_rank, &this->comm_participating) != MPI_SUCCESS) { + fprintf(stderr, "Error creating communicator\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + return false; + } + // Get the size of the split pariticpating communicator + MPI_Comm_size(this->comm_participating, &this->participating_size); + // Get the local rank within the split communicator + MPI_Comm_rank(this->comm_participating, &this->participating_rank); + } + return true; +} + +std::set MPIEnsemble::devicesForThisRank(const std::set devicesToSelectFrom, int local_size, int local_rank) { + // create a vector from teh set to enable direct access. 
+ std::vector devicesToSelectFromVector = std::vector(devicesToSelectFrom.begin(), devicesToSelectFrom.end()); + int device_count = static_cast(devicesToSelectFrom.size()); + // if there is only a single mpi rank on this shared memory system, assign all devices, or if there are no devices to select from + if (local_size == 1 || device_count == 0) { + return devicesToSelectFrom; + } else if (local_size > 1 && local_size <= device_count) { + // Otherwise, if there are more than one rank per node, but fewer ranks than gpus, attempt to load balance + std::set devices; + // find the balanced number of gpus per rank, and how many will need +1 + int gpusPerRank = device_count / local_size; + int unallocated = device_count - (gpusPerRank * local_size); + // Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1 + int lowerRanksWithPlus1 = local_rank < unallocated ? local_rank : unallocated; + int lowerRanksWithPlus0 = std::max(0, local_rank - unallocated); + int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerRanksWithPlus0 * gpusPerRank); + int last = local_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank; + // Assign the devices for this rank + for (int i = first; i < last; i++) { + devices.emplace(devicesToSelectFromVector[i]); + } + return devices; + } else { + // Otherwise, there are more ranks than gpus, so use upto one gpu per rank. + std::set devices; + for (const auto & d : devicesToSelectFromVector) {} + if (local_rank < device_count) { + devices.emplace(local_rank); + } + return devices; + } +} + +std::set MPIEnsemble::devicesForThisRank(const std::set devicesToSelectFrom) { + return MPIEnsemble::devicesForThisRank(devicesToSelectFrom, this->local_size, this->local_rank); +} + +} // namespace detail +} // namespace flamegpu +#endif diff --git a/src/flamegpu/simulation/detail/MPISimRunner.cu b/src/flamegpu/simulation/detail/MPISimRunner.cu new file mode 100644 index 000000000..164b2988d --- /dev/null +++ b/src/flamegpu/simulation/detail/MPISimRunner.cu @@ -0,0 +1,88 @@ +#include "flamegpu/simulation/detail/MPISimRunner.h" + +#include + +#include "flamegpu/model/ModelData.h" +#include "flamegpu/simulation/CUDASimulation.h" +#include "flamegpu/simulation/RunPlanVector.h" + +#ifdef _MSC_VER +#include +#else +#include +#endif + +namespace flamegpu { +namespace detail { + +MPISimRunner::MPISimRunner(const std::shared_ptr _model, + std::atomic& _err_ct, + std::atomic& _next_run, + const RunPlanVector& _plans, + std::shared_ptr _step_log_config, + std::shared_ptr _exit_log_config, + int _device_id, + unsigned int _runner_id, + flamegpu::Verbosity _verbosity, + std::map& _run_logs, + std::queue& _log_export_queue, + std::mutex& _log_export_queue_mutex, + std::condition_variable& _log_export_queue_cdn, + std::vector& _err_detail_local, + const unsigned int _total_runners, + bool _isSWIG) + : AbstractSimRunner( + _model, + _err_ct, + _next_run, + _plans, + _step_log_config, + _exit_log_config, + _device_id, + _runner_id, + _verbosity, + _run_logs, + _log_export_queue, + _log_export_queue_mutex, + _log_export_queue_cdn, + _err_detail_local, + _total_runners, + _isSWIG) + { } + +void MPISimRunner::main() { + // While there are still plans to process + while (true) { + const unsigned int run_id = next_run.load(); + if (run_id < plans.size()) { + // Process the assigned job + try { + runSimulation(run_id); + if (next_run.exchange(Signal::RequestJob) >= plans.size()) { + break; + } + // MPI Worker's don't 
print progress + } catch(std::exception &e) { + // log_export_mutex is treated as our protection for race conditions on err_detail + std::lock_guard lck(log_export_queue_mutex); + // Build the error detail (fixed len char array for string) + // fprintf(stderr, "Fail: run: %u device: %u, runner: %u\n", run_id, device_id, runner_id); // useful debug, breaks tests + err_detail.push_back(ErrorDetail{run_id, static_cast(device_id), runner_id, }); + strncpy(err_detail.back().exception_string, e.what(), sizeof(ErrorDetail::exception_string)-1); + err_detail.back().exception_string[sizeof(ErrorDetail::exception_string) - 1] = '\0'; + err_ct.store(static_cast(err_detail.size())); + // Need to notify manager that run failed + if (next_run.exchange(Signal::RunFailed) >= plans.size()) { + break; + } + } + } else if (run_id == Signal::RequestJob || run_id == Signal::RunFailed) { + std::this_thread::yield(); + } else { + break; + } + } +} + +} // namespace detail +} // namespace flamegpu diff --git a/src/flamegpu/simulation/detail/SimLogger.cu b/src/flamegpu/simulation/detail/SimLogger.cu index 099386d07..0498d633b 100644 --- a/src/flamegpu/simulation/detail/SimLogger.cu +++ b/src/flamegpu/simulation/detail/SimLogger.cu @@ -14,7 +14,7 @@ namespace flamegpu { namespace detail { -SimLogger::SimLogger(const std::vector &_run_logs, +SimLogger::SimLogger(const std::map &_run_logs, const RunPlanVector &_run_plans, const std::string &_out_directory, const std::string &_out_format, @@ -77,12 +77,12 @@ void SimLogger::start() { if (export_exit) { const std::filesystem::path exit_path = p_out_directory / std::filesystem::path(run_plans[target_log].getOutputSubdirectory()) / std::filesystem::path("exit." + out_format); const auto exit_logger = io::LoggerFactory::createLogger(exit_path.generic_string(), false, false); - exit_logger->log(run_logs[target_log], run_plans[target_log], false, true, false, export_exit_time); + exit_logger->log(run_logs.at(target_log), run_plans[target_log], false, true, false, export_exit_time); } if (export_step) { const std::filesystem::path step_path = p_out_directory/std::filesystem::path(run_plans[target_log].getOutputSubdirectory())/std::filesystem::path(std::to_string(target_log)+"."+out_format); const auto step_logger = io::LoggerFactory::createLogger(step_path.generic_string(), false, true); - step_logger->log(run_logs[target_log], run_plans[target_log], true, false, export_step_time, false); + step_logger->log(run_logs.at(target_log), run_plans[target_log], true, false, export_step_time, false); } // Continue ++logs_processed; diff --git a/src/flamegpu/simulation/detail/SimRunner.cu b/src/flamegpu/simulation/detail/SimRunner.cu index b6e421748..98f9f0740 100644 --- a/src/flamegpu/simulation/detail/SimRunner.cu +++ b/src/flamegpu/simulation/detail/SimRunner.cu @@ -1,17 +1,7 @@ #include "flamegpu/simulation/detail/SimRunner.h" -#include - -#include "flamegpu/model/ModelData.h" -#include "flamegpu/simulation/CUDASimulation.h" #include "flamegpu/simulation/RunPlanVector.h" -#ifdef _MSC_VER -#include -#else -#include -#endif - namespace flamegpu { namespace detail { @@ -25,91 +15,39 @@ SimRunner::SimRunner(const std::shared_ptr _model, unsigned int _runner_id, flamegpu::Verbosity _verbosity, bool _fail_fast, - std::vector &_run_logs, + std::map &_run_logs, std::queue &_log_export_queue, std::mutex &_log_export_queue_mutex, std::condition_variable &_log_export_queue_cdn, - ErrorDetail &_fast_err_detail, + std::vector &_err_detail, const unsigned int _total_runners, bool _isSWIG) - : 
model(_model->clone()) - , run_id(0) - , device_id(_device_id) - , runner_id(_runner_id) - , total_runners(_total_runners) - , verbosity(_verbosity) - , fail_fast(_fail_fast) - , err_ct(_err_ct) - , next_run(_next_run) - , plans(_plans) - , step_log_config(std::move(_step_log_config)) - , exit_log_config(std::move(_exit_log_config)) - , run_logs(_run_logs) - , log_export_queue(_log_export_queue) - , log_export_queue_mutex(_log_export_queue_mutex) - , log_export_queue_cdn(_log_export_queue_cdn) - , fast_err_detail(_fast_err_detail) - , isSWIG(_isSWIG) { - this->thread = std::thread(&SimRunner::start, this); - // Attempt to name the thread -#ifdef _MSC_VER - std::wstringstream thread_name; - thread_name << L"CUDASim D" << device_id << L"T" << runner_id; - // HRESULT hr = - SetThreadDescription(this->thread.native_handle(), thread_name.str().c_str()); - // if (FAILED(hr)) { - // fprintf(stderr, "Failed to name thread 'CUDASim D%dT%u'\n", device_id, runner_id); - // } -#else - std::stringstream thread_name; - thread_name << "CUDASim D" << device_id << "T" << runner_id; - // int hr = - pthread_setname_np(this->thread.native_handle(), thread_name.str().c_str()); - // if (hr) { - // fprintf(stderr, "Failed to name thread 'CUDASim D%dT%u'\n", device_id, runner_id); - // } -#endif -} + : AbstractSimRunner( + _model, + _err_ct, + _next_run, + _plans, + _step_log_config, + _exit_log_config, + _device_id, + _runner_id, + _verbosity, + _run_logs, + _log_export_queue, + _log_export_queue_mutex, + _log_export_queue_cdn, + _err_detail, + _total_runners, + _isSWIG) + , fail_fast(_fail_fast) { } -void SimRunner::start() { +void SimRunner::main() { + unsigned int run_id = 0; // While there are still plans to process - while ((this->run_id = next_run++) < plans.size()) { + while ((run_id = next_run++) < plans.size()) { try { - // Update environment (this might be worth moving into CUDASimulation) - auto &prop_map = model->environment->properties; - for (auto &ovrd : plans[run_id].property_overrides) { - auto &prop = prop_map.at(ovrd.first); - memcpy(prop.data.ptr, ovrd.second.ptr, prop.data.length); - } - // Set simulation device - std::unique_ptr simulation = std::unique_ptr(new CUDASimulation(model, isSWIG)); - // Copy steps and seed from runplan - simulation->SimulationConfig().steps = plans[run_id].getSteps(); - simulation->SimulationConfig().random_seed = plans[run_id].getRandomSimulationSeed(); - simulation->SimulationConfig().verbosity = Verbosity::Default; - if (verbosity == Verbosity::Quiet) // Use quiet verbosity for sims if set in ensemble but never verbose - simulation->SimulationConfig().verbosity = Verbosity::Quiet; - simulation->SimulationConfig().telemetry = false; // Never any telemtry for indiviual runs inside an ensemble - simulation->SimulationConfig().timing = false; - simulation->CUDAConfig().device_id = this->device_id; - simulation->CUDAConfig().is_ensemble = true; - simulation->CUDAConfig().ensemble_run_id = run_id; - simulation->applyConfig(); - // Set the step config directly, to bypass validation - simulation->step_log_config = step_log_config; - simulation->exit_log_config = exit_log_config; - // TODO Set population? 
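A note on the coordination pattern the MPI path above relies on: each local runner thread owns a std::atomic<unsigned int> slot that holds either a plan index or a sentinel (RequestJob / RunFailed), while the management loop polls the slots, replacing RequestJob with the next run id and storing plans.size() to signal shutdown. A minimal sketch of the runner side, with sentinel values assumed purely for illustration (the real values come from detail::MPISimRunner::Signal, whose header is not shown in this diff):

```cpp
#include <atomic>
#include <climits>
#include <thread>

// Assumed sentinels for illustration only; the real enum is detail::MPISimRunner::Signal.
enum Signal : unsigned int { RequestJob = UINT_MAX - 1, RunFailed = UINT_MAX - 2 };

// Runner-side loop: execute whatever plan index the manager stores in 'slot',
// then hand the slot back by storing RequestJob. Any non-sentinel value
// >= plan_count means "shut down".
void runnerLoop(std::atomic<unsigned int> &slot, unsigned int plan_count) {
    while (true) {
        const unsigned int id = slot.load();
        if (id < plan_count) {
            // ... run the simulation for plan 'id' here ...
            if (slot.exchange(RequestJob) >= plan_count)
                break;                  // manager requested shutdown while we were running
        } else if (id == RequestJob || id == RunFailed) {
            std::this_thread::yield();  // nothing assigned yet; wait for the manager
        } else {
            break;                      // manager stored plan_count (or larger): exit
        }
    }
}
```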
- // Execute simulation - simulation->simulate(); - // Store results in run_log (use placement new because const members) - run_logs[this->run_id] = simulation->getRunLog(); - // Notify logger - { - std::lock_guard lck(log_export_queue_mutex); - log_export_queue.push(this->run_id); - } - log_export_queue_cdn.notify_one(); + runSimulation(run_id); // Print progress to console if (verbosity >= Verbosity::Default) { const int progress = static_cast(next_run.load()) - static_cast(total_runners) + 1; @@ -122,13 +60,13 @@ void SimRunner::start() { // Kill the other workers early next_run += static_cast(plans.size()); { + // log_export_mutex is treated as our protection for race conditions on err_detail std::lock_guard lck(log_export_queue_mutex); log_export_queue.push(UINT_MAX); - // log_export_mutex is treated as our protection for race conditions on fast_err_detail - fast_err_detail.run_id = run_id; - fast_err_detail.device_id = device_id; - fast_err_detail.runner_id = runner_id; - fast_err_detail.exception_string = e.what(); + // Build the error detail (fixed len char array for string) + err_detail.push_back(ErrorDetail{run_id, static_cast(device_id), runner_id, }); + strncpy(err_detail.back().exception_string, e.what(), sizeof(ErrorDetail::exception_string)-1); + err_detail.back().exception_string[sizeof(ErrorDetail::exception_string) - 1] = '\0'; } return; } else { diff --git a/src/flamegpu/util/cleanup.cu b/src/flamegpu/util/cleanup.cu index 6b680692e..b66e827bb 100644 --- a/src/flamegpu/util/cleanup.cu +++ b/src/flamegpu/util/cleanup.cu @@ -1,5 +1,9 @@ #include "flamegpu/util/cleanup.h" +#ifdef FLAMEGPU_ENABLE_MPI +#include +#endif + #include #include "flamegpu/simulation/detail/CUDAErrorChecking.cuh" #include "flamegpu/detail/JitifyCache.h" @@ -8,6 +12,16 @@ namespace flamegpu { namespace util { void cleanup() { +#ifdef FLAMEGPU_ENABLE_MPI + int init_flag = 0; + int fin_flag = 0; + // MPI can only be init and finalized once + MPI_Initialized(&init_flag); + MPI_Finalized(&fin_flag); + if (init_flag && !fin_flag) { + MPI_Finalize(); + } +#endif int originalDevice = 0; gpuErrchk(cudaGetDevice(&originalDevice)); // Reset all cuda devices for memcheck / profiling purposes. diff --git a/swig/python/flamegpu.i b/swig/python/flamegpu.i index 59d05877f..d2eae31fc 100644 --- a/swig/python/flamegpu.i +++ b/swig/python/flamegpu.i @@ -870,7 +870,7 @@ namespace std { %template(dependsOn) flamegpu::DependencyNode::dependsOn; %template(StepLogFrameList) std::list; -%template(RunLogVec) std::vector; +%template(RunLogMap) std::map; // Instantiate template versions of agent functions from the API TEMPLATE_VARIABLE_INSTANTIATE_ID(newVariable, flamegpu::AgentDescription::newVariable) @@ -1263,6 +1263,13 @@ TEMPLATE_VARIABLE_INSTANTIATE_INTS(poisson, flamegpu::HostRandom::poisson) #define GLM false #endif +#ifdef FLAMEGPU_ENABLE_MPI + #undef FLAMEGPU_ENABLE_MPI + #define MPI true +#else + #define MPI false +#endif + // Declare an empty type we can use as an attribute for constants to be pulled in by codegen %pythoncode { class constant: diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f8282ba72..269db9ffa 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.18...3.25 FATAL_ERROR) # Option to enable GTEST_DISCOVER if tests or tests_dev are enabled. Defaults to off due to runtime increase -cmake_dependent_option(FLAMEGPU_ENABLE_GTEST_DISCOVER "Enable GTEST_DISCOVER for more detailed ctest output without -VV. 
This dramitically increases test suite runtime to CUDA context initialisation." OFF "FLAMEGPU_BUILD_TESTS OR FLAMEGPU_BUILD_TESTS_DEV" OFF) +cmake_dependent_option(FLAMEGPU_ENABLE_GTEST_DISCOVER "Enable GTEST_DISCOVER for more detailed ctest output without -VV. This dramatically increases test suite runtime to CUDA context initialisation." OFF "FLAMEGPU_BUILD_TESTS OR FLAMEGPU_BUILD_TESTS_DEV" OFF) # Only Do anything if FLAMEGPU_BUILD_TESTS or FLAMEGPU_BUILD_TESTS_DEV is set. if(NOT (FLAMEGPU_BUILD_TESTS OR FLAMEGPU_BUILD_TESTS_DEV)) @@ -91,6 +91,10 @@ SET(TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/test_namespaces/test_rtc_namespaces.cu ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/test_version.cpp ) +# Source files for the tests_mpi target (distinct suite only created in the presence of FLAMEGPU_ENABLE_MPI) +SET(TESTS_MPI_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_mpi_ensemble.cu +) # Source files for the tests_dev target SET(TESTS_DEV_SRC ) @@ -112,22 +116,25 @@ include(${FLAMEGPU_ROOT}/cmake/CUDAArchitectures.cmake) # Include googletest as a dependency. include(${FLAMEGPU_ROOT}/cmake/dependencies/googletest.cmake) -# If CTest's GoogleTest integreation is required, include GoogleTest. +# If CTest's GoogleTest integration is required, include GoogleTest. if((FLAMEGPU_BUILD_TESTS OR FLAMEGPU_BUILD_TESTS_DEV) AND FLAMEGPU_ENABLE_GTEST_DISCOVER) include(GoogleTest) endif() -if(FLAMEGPU_BUILD_TESTS) + +# Define a macro for creating a named test project/executable +# create_test_suite(suite_name, suite_src) +macro(create_test_project) enable_testing() # Handle CMAKE_CUDA_ARCHITECTURES and inject code into the tests project() command - flamegpu_init_cuda_architectures(PROJECT tests) + flamegpu_init_cuda_architectures(PROJECT ${ARGV0}) # Name the project and set languages - project(tests CUDA CXX) + project(${ARGV0} CUDA CXX) # Include common rules. include(${FLAMEGPU_ROOT}/cmake/common.cmake) - # Set the source for this projcet + # Set the source for this project SET(ALL_SRC - ${TESTS_SRC} + ${${ARGV1}} ${HELPERS_SRC} ) # Define output location of binary files @@ -138,20 +145,18 @@ if(FLAMEGPU_BUILD_TESTS) target_include_directories("${PROJECT_NAME}" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") # Add the targets we depend on (this does link and include) target_link_libraries("${PROJECT_NAME}" PRIVATE GTest::gtest) - # Pass a tests specific compiler definition for cheking CMAKE_CUDA_ARCHITECTURES behaviour + # Pass a tests specific compiler definition for checking CMAKE_CUDA_ARCHITECTURES behaviour flamegpu_get_minimum_cuda_architecture(min_cuda_arch) target_compile_definitions(${PROJECT_NAME} PRIVATE FLAMEGPU_TEST_MIN_CUDA_ARCH=${min_cuda_arch}) # Put Within Tests filter flamegpu_set_target_folder("${PROJECT_NAME}" "Tests") - # Also set as startup project (if top level project) - set_property(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" PROPERTY VS_STARTUP_PROJECT "${PROJECT_NAME}") # Set the default (visual studio) debugger configure_file set_target_properties("${PROJECT_NAME}" PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" VS_DEBUGGER_COMMAND_ARGUMENTS "$<$:--gtest_catch_exceptions=0> --gtest_filter=*") - + # Add the tests target as to ctest, optionally using the gtest_discover integration. if(FLAMEGPU_ENABLE_GTEST_DISCOVER) - # If GTEST_DISCOVER is enabled, add the unit test executable using it. This results in very long test exeuction times due to CUDA context initialisation per test + # If GTEST_DISCOVER is enabled, add the unit test executable using it. 
This results in very long test execution times due to CUDA context initialisation per test gtest_discover_tests( "${PROJECT_NAME}" WORKING_DIRECTORY @@ -165,53 +170,17 @@ if(FLAMEGPU_BUILD_TESTS) WORKING_DIRECTORY ${PROJECT_DIR} ) endif() -endif() +endmacro() -# If the tests_dev target is requirest, create it. -if(FLAMEGPU_BUILD_TESTS_DEV) - enable_testing() - # Handle CMAKE_CUDA_ARCHITECTURES and inject code into the tests project() command - flamegpu_init_cuda_architectures(PROJECT tests_dev) - # DEVELOPMENT TESTING THING (Compact repeated version of above) - project(tests_dev CUDA CXX) - # Include common rules. - include(${FLAMEGPU_ROOT}/cmake/common.cmake) - # Set the source for this projcet - SET(ALL_SRC - ${TESTS_DEV_SRC} - ${HELPERS_SRC} - ) - # Define output location of binary files - SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}/) - # Add the executable and set required flags for the target - flamegpu_add_executable("${PROJECT_NAME}" "${ALL_SRC}" "${FLAMEGPU_ROOT}" "${PROJECT_BINARY_DIR}" FALSE) - # Add the tests directory to the include path, - target_include_directories("${PROJECT_NAME}" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") - # Add the targets we depend on (this does link and include) - target_link_libraries("${PROJECT_NAME}" PRIVATE GTest::gtest) - # Pass a tests specific compiler definition for cheking CMAKE_CUDA_ARCHITECTURES behaviour - flamegpu_get_minimum_cuda_architecture(min_cuda_arch) - target_compile_definitions(${PROJECT_NAME} PRIVATE FLAMEGPU_TEST_MIN_CUDA_ARCH=${min_cuda_arch}) - # Put Within Tests filter - flamegpu_set_target_folder("${PROJECT_NAME}" "Tests") - # Set the default (visual studio) debugger configure_file - set_target_properties("${PROJECT_NAME}" PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" - VS_DEBUGGER_COMMAND_ARGUMENTS "$<$:--gtest_catch_exceptions=0> --gtest_filter=*") - - # Add the tests target as to ctest, optionally using the gtest_discover integration. - if(FLAMEGPU_ENABLE_GTEST_DISCOVER) - # If GTEST_DISCOVER is enabled, add the unit test executable using it. This results in very long test exeuction times due to CUDA context initialisation per test - gtest_discover_tests( - "${PROJECT_NAME}" - WORKING_DIRECTORY - ${PROJECT_DIR} - ) - else() - # Otherwise add the whole test suite as a single test. Use ctest -VV to view per-test results in this case. - add_test( - NAME ${PROJECT_NAME} - COMMAND "${PROJECT_NAME}" - WORKING_DIRECTORY ${PROJECT_DIR} - ) +if(FLAMEGPU_BUILD_TESTS) + # Create main test suite + create_test_project(tests TESTS_SRC) + # If the tests_mpi target is required, create it. + if(FLAMEGPU_ENABLE_MPI) + create_test_project(tests_mpi TESTS_MPI_SRC) endif() endif() +# If the tests_dev target is required, create it. 
+if(FLAMEGPU_BUILD_TESTS_DEV) + create_test_project(tests_dev TESTS_DEV_SRC) +endif() diff --git a/tests/helpers/main.cu b/tests/helpers/main.cu index 60547ada1..949e85a53 100644 --- a/tests/helpers/main.cu +++ b/tests/helpers/main.cu @@ -7,10 +7,11 @@ #include "helpers/device_initialisation.h" #include "flamegpu/io/Telemetry.h" #include "flamegpu/detail/TestSuiteTelemetry.h" +#include "flamegpu/util/cleanup.h" GTEST_API_ int main(int argc, char **argv) { - // Get the current status of telemetry, to control if test suite results shold be submit or not + // Get the current status of telemetry, to control if test suite results should be submitted or not const bool telemetryEnabled = flamegpu::io::Telemetry::isEnabled(); // Disable telemetry for simulation / ensemble objects in the test suite. flamegpu::io::Telemetry::disable(); @@ -22,16 +23,9 @@ GTEST_API_ int main(int argc, char **argv) { printf("Running main() from %s\n", __FILE__); testing::InitGoogleTest(&argc, argv); auto rtn = RUN_ALL_TESTS(); - // Reset all cuda devices for memcheck / profiling purposes. - int devices = 0; - gpuErrchk(cudaGetDeviceCount(&devices)); - if (devices > 0) { - for (int device = 0; device < devices; ++device) { - gpuErrchk(cudaSetDevice(device)); - gpuErrchk(cudaDeviceReset()); - } - } - // If there were more than 1 tests selected, (to exlcude bad filters and FLAMEGPU_USE_GTEST_DISCOVER related spam) + // Reset all cuda devices for memcheck / profiling purposes (and finalize MPI) + flamegpu::util::cleanup(); + // If more than 1 test was selected (to exclude bad filters and FLAMEGPU_USE_GTEST_DISCOVER related spam) if (telemetryEnabled && ::testing::UnitTest::GetInstance()->test_to_run_count() > 1) { // Detect if -v / --verbose was passed to the test suite binary to log the payload command bool verbose = false; diff --git a/tests/python/io/test_logging.py b/tests/python/io/test_logging.py index 5a0e5335a..2ff0c6891 100644 --- a/tests/python/io/test_logging.py +++ b/tests/python/io/test_logging.py @@ -401,7 +401,7 @@ def test_CUDAEnsembleSimulate(self): # Check step log run_logs = sim.getLogs(); i_id = 0; - for log in run_logs: + for _, log in run_logs.items(): # Check step log steps = log.getStepLog(); # init log, + 5 logs from 10 steps diff --git a/tests/python/simulation/test_cuda_ensemble.py b/tests/python/simulation/test_cuda_ensemble.py index 1729047fe..7ecd61331 100644 --- a/tests/python/simulation/test_cuda_ensemble.py +++ b/tests/python/simulation/test_cuda_ensemble.py @@ -324,7 +324,7 @@ def test_setStepLog(self): # Get the logs, checking the correct number are present. runLogs = ensemble.getLogs() assert runLogs.size() == plans.size() - for log in runLogs: + for _, log in runLogs.items(): stepLogs = log.getStepLog() assert stepLogs.size() == 1 + 1 # This is 1 + 1 due to the always present init log. expectedStepCount = 0 @@ -368,7 +368,7 @@ def test_setExitLog(self): # Get the logs, checking the correct number are present. 
runLogs = ensemble.getLogs() assert runLogs.size() == plans.size() - for log in runLogs: + for _, log in runLogs.items(): exitLog = log.getExitLog() assert exitLog.getStepCount() == 1 diff --git a/tests/python/test_mpi.py b/tests/python/test_mpi.py new file mode 100644 index 000000000..ef773fe84 --- /dev/null +++ b/tests/python/test_mpi.py @@ -0,0 +1,64 @@ +import pytest +from unittest import TestCase +from pyflamegpu import * +import time + + +class model_step(pyflamegpu.HostFunction): + def run(self, FLAMEGPU): + counter = FLAMEGPU.environment.getPropertyInt("counter") + counter+=1 + FLAMEGPU.environment.setPropertyInt("counter", counter) + time.sleep(0.1) # Sleep 100ms + +class throw_exception(pyflamegpu.HostFunction): + def run(self, FLAMEGPU): + counter = FLAMEGPU.environment.getPropertyInt("counter"); + init_counter = FLAMEGPU.environment.getPropertyInt("init_counter"); + if FLAMEGPU.getStepCounter() == 1 and counter == 8: + raise Exception("Exception thrown by host fn throw_exception()"); + +@pytest.mark.skipif(not pyflamegpu.MPI, reason="pyflamegpu not built with MPI enabled") +class TestMPIEnsemble(TestCase): + + def test_mpi(self): + # init model + model = pyflamegpu.ModelDescription("MPITest") + model.Environment().newPropertyInt("counter", -1) + model.Environment().newPropertyInt("init_counter", -1) + model.newAgent("agent") + model.newLayer().addHostFunction(model_step()) + model.newLayer().addHostFunction(throw_exception()) + # init plans + RUNS_PER_RANK = 10 + world_size = 2 # can't probe MPI easily, so just default to 2 + plans = pyflamegpu.RunPlanVector(model, RUNS_PER_RANK * world_size); + plans.setSteps(10) + plans.setPropertyLerpRangeInt("counter", 0, (RUNS_PER_RANK * world_size) - 1) + plans.setPropertyLerpRangeInt("init_counter", 0, (RUNS_PER_RANK * world_size) - 1) + # init exit logging config + exit_log_cfg = pyflamegpu.LoggingConfig(model) + exit_log_cfg.logEnvironment("counter") + # init ensemble + ensemble = pyflamegpu.CUDAEnsemble(model) + ensemble.Config().concurrent_runs = 1 + ensemble.Config().error_level = pyflamegpu.CUDAEnsembleConfig.Off + ## Can't do anything fancy like splitting GPUs + ensemble.setExitLog(exit_log_cfg) + # Run the ensemble + err_count = ensemble.simulate(plans) + ## Check that 1 error occurred at rank 0 + ## err_count == 1 + # Validate logs + ## @note Best we can currently do is check logs of each runner have correct results + ## @note Ideally we'd validate between nodes to ensure all runs have been completed + logs = ensemble.getLogs() + for index, log in logs.items(): + exit_log = log.getExitLog() + # Get a logged environment property + counter = exit_log.getEnvironmentPropertyInt("counter") + assert counter == index + 10 + + # Unable to call pyflamegpu.cleanup() in pytest, as other tests might want to use MPI afterwards, but MPI_Finalize would prevent that. + # Cleanup can only be tested if this is the only test being executed. 
+ # pyflamegpu.cleanup() diff --git a/tests/python/util/test_cleanup.py b/tests/python/util/test_cleanup.py index 415cc2d72..157105432 100644 --- a/tests/python/util/test_cleanup.py +++ b/tests/python/util/test_cleanup.py @@ -14,6 +14,7 @@ def run(self, FLAMEGPU): for i in range(AGENT_COUNT): agent.newAgent() +@pytest.mark.skipif(pyflamegpu.MPI, reason="Cleanup cannot be safely tested in pytest with MPI due to use of MPI_Finalize (if it has been initialised already)") class CleanupTest(TestCase): """ Test suite for the flamegpu::util::cleanup diff --git a/tests/test_cases/io/test_logging.cu b/tests/test_cases/io/test_logging.cu index 58f780346..2af5ae606 100644 --- a/tests/test_cases/io/test_logging.cu +++ b/tests/test_cases/io/test_logging.cu @@ -433,7 +433,7 @@ TEST(LoggingTest, CUDAEnsembleSimulate) { { // Check step log const auto &run_logs = sim.getLogs(); i_id = 0; - for (auto &log : run_logs) { + for (const auto &[_, log] : run_logs) { { // Check step log auto &steps = log.getStepLog(); EXPECT_EQ(steps.size(), 6); // init log, + 5 logs from 10 steps diff --git a/tests/test_cases/simulation/test_cuda_ensemble.cu b/tests/test_cases/simulation/test_cuda_ensemble.cu index f8cbadb5f..5f37a4cfe 100644 --- a/tests/test_cases/simulation/test_cuda_ensemble.cu +++ b/tests/test_cases/simulation/test_cuda_ensemble.cu @@ -473,8 +473,8 @@ TEST(TestCUDAEnsemble, setStepLog) { // Get the logs, checking the correct number are present. const auto &runLogs = ensemble.getLogs(); EXPECT_EQ(runLogs.size(), plans.size()); - for (auto &log : runLogs) { - auto &stepLogs = log.getStepLog(); + for (const auto &[_, log] : runLogs) { + const auto &stepLogs = log.getStepLog(); EXPECT_EQ(stepLogs.size(), 1 + 1); // This is 1 + 1 due to the always present init log. uint32_t expectedStepCount = 0; for (const auto &stepLog : stepLogs) { @@ -516,7 +516,7 @@ TEST(TestCUDAEnsemble, setExitLog) { // Get the logs, checking the correct number are present. 
const auto &runLogs = ensemble.getLogs(); EXPECT_EQ(runLogs.size(), plans.size()); - for (auto &log : runLogs) { + for (const auto &[_, log] : runLogs) { const auto &exitLog = log.getExitLog(); ASSERT_EQ(exitLog.getStepCount(), 1u); } diff --git a/tests/test_cases/simulation/test_mpi_ensemble.cu b/tests/test_cases/simulation/test_mpi_ensemble.cu new file mode 100644 index 000000000..dfb1fae53 --- /dev/null +++ b/tests/test_cases/simulation/test_mpi_ensemble.cu @@ -0,0 +1,476 @@ +#include +#include + +#ifdef FLAMEGPU_ENABLE_MPI +#include <mpi.h> +#endif + +#include "flamegpu/flamegpu.h" +#include "flamegpu/simulation/detail/MPIEnsemble.h" +#include "gtest/gtest.h" + + +namespace flamegpu { +namespace tests { +namespace test_mpi_ensemble { +/** + * Testing MPI with GoogleTest is somewhat impractical + * Therefore this suite intends to run a single complete test depending on the run state + * Not built with FLAMEGPU_ENABLE_MPI: All TestMPIEnsemble disabled + * Not executed with mpirun: MPI initialises with a world_size 1, the model should execute as normal + * Executed on a single node TestMPIEnsemble 'mpirun -n 4 ./bin/Debug/tests', TestMPIEnsemble.local will run + * Executed on multiple nodes TestMPIEnsemble 'mpirun -n $SLURM_JOB_NUM_NODES -hosts $SLURM_JOB_NODELIST ./bin/Debug/tests', TestMPIEnsemble.multi will run + * + * Error tests do not currently control whether the error occurs on rank==0 or rank != 0 + * These errors are handled differently, so it would benefit from duplicate tests (for multi-rank runs) + * + * MPI_Init() and MPI_Finalize() can only be called once each + * + * Ideally these tests should be executed with fewer MPI ranks than GPUs, the same number of MPI ranks as GPUs, and more MPI ranks than GPUs. + */ +#ifdef FLAMEGPU_ENABLE_MPI +int has_error = 0; +FLAMEGPU_STEP_FUNCTION(model_step) { + int counter = FLAMEGPU->environment.getProperty<int>("counter"); + counter+=1; + FLAMEGPU->environment.setProperty("counter", counter); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); +} +FLAMEGPU_STEP_FUNCTION(throw_exception_rank_0) { + int world_rank = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + if (world_rank != 0) return; + const int counter = FLAMEGPU->environment.getProperty<int>("counter"); + const int init_counter = FLAMEGPU->environment.getProperty<int>("init_counter"); + if (FLAMEGPU->getStepCounter() == 1 && counter > 6 && has_error < 2) { + ++has_error; + // printf("exception counter: %u, init_counter: %u\n", counter, init_counter); + throw std::runtime_error("Exception thrown by host fn throw_exception()"); + } +} +FLAMEGPU_STEP_FUNCTION(throw_exception_rank_1) { + int world_rank = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + if (world_rank != 1) return; + const int counter = FLAMEGPU->environment.getProperty<int>("counter"); + const int init_counter = FLAMEGPU->environment.getProperty<int>("init_counter"); + if (FLAMEGPU->getStepCounter() == 1 && counter > 6 && has_error < 2) { + ++has_error; + // printf("exception counter: %u, init_counter: %u\n", counter, init_counter); + throw std::runtime_error("Exception thrown by host fn throw_exception()"); + } +} +class TestMPIEnsemble : public testing::Test { + protected: + void SetUp() override { + has_error = 0; + int flag = 0; + MPI_Finalized(&flag); + if (flag) { + GTEST_SKIP() << "Skipping MPI test, MPI_Finalize() has been called"; + return; + } + // Init MPI, fetch rank and size + int thread_provided = 0; + // MPI can only be init once, for certain test cases we do some initial MPI comms for setup + MPI_Initialized(&flag); + 
if (!flag) { + // MPI single means that only the main thread will perform MPI actions + MPI_Init_thread(NULL, NULL, MPI_THREAD_SINGLE, &thread_provided); + if (thread_provided != MPI_THREAD_SINGLE) { + THROW exception::UnknownInternalError("MPI unable to provide MPI_THREAD_SINGLE support"); + } + } + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + // Create a shared memory split + MPI_Comm group; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group); + MPI_Comm_size(group, &group_size); + if (world_size < 1) { + GTEST_SKIP() << "world_size<1, something went wrong."; + } + initModel(); + } + void TearDown() override { + if (ensemble) delete ensemble; + if (exit_log_cfg) delete exit_log_cfg; + if (plans) delete plans; + if (model) delete model; + } + void initModel() { + model = new flamegpu::ModelDescription("MPITest"); + model->Environment().newProperty("counter", -1); + model->Environment().newProperty("init_counter", -1); + model->newAgent("agent"); + model->newLayer().addHostFunction(model_step); + } + void initPlans() { + // 10 runs per rank, steps which take >100ms each, + // therefore each sim will take >1 second. + // Estimate 10 seconds for full test across any world size + const int RUNS_PER_RANK = 10; + plans = new flamegpu::RunPlanVector(*model, RUNS_PER_RANK * world_size); + plans->setSteps(10); + plans->setPropertyLerpRange("counter", 0, (RUNS_PER_RANK * world_size) - 1); + plans->setPropertyLerpRange("init_counter", 0, (RUNS_PER_RANK * world_size) - 1); + } + void initExitLoggingConfig() { + exit_log_cfg = new flamegpu::LoggingConfig(*model); + exit_log_cfg->logEnvironment("counter"); + } + void initEnsemble() { + initPlans(); + initExitLoggingConfig(); + ensemble = new flamegpu::CUDAEnsemble(*model); + ensemble->Config().concurrent_runs = 1; + ensemble->setExitLog(*exit_log_cfg); + } + void validateLogs() { + // Validate results + // @note Best we can currently do is check logs of each runner have correct results + // @note Ideally we'd validate between nodes to ensure all runs have been completed + const std::map<unsigned int, RunLog> logs = ensemble->getLogs(); + for (const auto &[index, log] : logs) { + const ExitLogFrame& exit_log = log.getExitLog(); + // Get a logged environment property + const int counter = exit_log.getEnvironmentProperty<int>("counter"); + EXPECT_EQ(counter, index + 10); + } + } + int world_rank = -1; + int group_size = -1; + int world_size = -1; + flamegpu::ModelDescription *model = nullptr; + flamegpu::RunPlanVector *plans = nullptr; + flamegpu::LoggingConfig *exit_log_cfg = nullptr; + flamegpu::CUDAEnsemble *ensemble = nullptr; +}; +TEST_F(TestMPIEnsemble, success) { + initEnsemble(); + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + const unsigned int err_count = ensemble->simulate(*plans); + EXPECT_EQ(err_count, 0u); + // Get stderr and stdout + std::string output = testing::internal::GetCapturedStdout(); + std::string errors = testing::internal::GetCapturedStderr(); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d + EXPECT_TRUE(output.find("CUDAEnsemble completed") != std::string::npos); // E.g. CUDAEnsemble completed 2 runs successfully! 
+ EXPECT_TRUE(errors.empty() || errors.find("Warning: MPI Ensemble launched with") != std::string::npos); // can't fully match the error message as it contains mpirun/machine specific numbers + } else { + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + + validateLogs(); +} +TEST_F(TestMPIEnsemble, success_verbose) { + initEnsemble(); + ensemble->Config().verbosity = Verbosity::Verbose; + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + const unsigned int err_count = ensemble->simulate(*plans); + EXPECT_EQ(err_count, 0u); + // Get stderr and stdout + std::string output = testing::internal::GetCapturedStdout(); + std::string errors = testing::internal::GetCapturedStderr(); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d + EXPECT_TRUE(output.find("CUDAEnsemble completed") != std::string::npos); // E.g. CUDAEnsemble completed 2 runs successfully! + EXPECT_TRUE(output.find("Ensemble time elapsed") != std::string::npos); // E.g. Ensemble time elapsed: 0.006000s + EXPECT_TRUE(errors.empty() || errors.find("Warning: MPI Ensemble launched with") != std::string::npos); // can't fully match the error message as it contains mpirun/machine specific numbers + } else { + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + + validateLogs(); +} + +TEST_F(TestMPIEnsemble, error_off_rank_0) { + model->newLayer().addHostFunction(throw_exception_rank_0); + initEnsemble(); + ensemble->Config().error_level = CUDAEnsemble::EnsembleConfig::Off; + unsigned int err_count = 0; + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + EXPECT_NO_THROW(err_count = ensemble->simulate(*plans)); + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + // With error off, we would expect to see run index 10 fail + // Therefore 1 returned instead of 0 + EXPECT_EQ(err_count, 2u); + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d + EXPECT_TRUE(output.find("CUDAEnsemble completed") != std::string::npos); // E.g. CUDAEnsemble completed 2 runs successfully! + EXPECT_TRUE(errors.find("Warning: Run ") != std::string::npos); // E.g. Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + EXPECT_TRUE(errors.find(" failed on rank ") != std::string::npos); // E.g. 
Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + } else { + // Capture stderr and stdout + // Only rank 0 returns the error count + EXPECT_EQ(err_count, 0u); + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + + // Existing logs should still validate + validateLogs(); +} +TEST_F(TestMPIEnsemble, error_slow_rank_0) { + model->newLayer().addHostFunction(throw_exception_rank_0); + initEnsemble(); + ensemble->Config().error_level = CUDAEnsemble::EnsembleConfig::Slow; + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + EXPECT_THROW(ensemble->simulate(*plans), flamegpu::exception::EnsembleError); + // @todo can't capture total number of successful/failed runs + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d + EXPECT_TRUE(output.find("CUDAEnsemble completed") != std::string::npos); // E.g. CUDAEnsemble completed 2 runs successfully! + EXPECT_TRUE(errors.find("Warning: Run ") != std::string::npos); // E.g. Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + EXPECT_TRUE(errors.find(" failed on rank ") != std::string::npos); // E.g. Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + } else { + // Only rank 0 raises exception + EXPECT_NO_THROW(ensemble->simulate(*plans)); + // @todo can't capture total number of successful/failed runs + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + // Existing logs should still validate + validateLogs(); +} +TEST_F(TestMPIEnsemble, error_fast_rank_0) { + model->newLayer().addHostFunction(throw_exception_rank_0); + initEnsemble(); + ensemble->Config().error_level = CUDAEnsemble::EnsembleConfig::Fast; + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + EXPECT_THROW(ensemble->simulate(*plans), flamegpu::exception::EnsembleError); + // @todo can't capture total number of successful/failed runs + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d +#ifdef _DEBUG + EXPECT_TRUE(errors.find("Run ") != std::string::npos); // E.g. Run 5 failed on rank 0, device 1, thread 0 with exception: + EXPECT_TRUE(errors.find(" failed on rank ") != std::string::npos); // E.g. 
Run 5 failed on rank 0, device 1, thread 0 with exception: +#else + EXPECT_TRUE(errors.empty() || errors.find("Warning: MPI Ensemble launched with") != std::string::npos); // can't fully match the error message as it contains mpirun/machine specific numbers +#endif + } else { + // Only rank 0 raises exception and captures total number of successful/failed runs + EXPECT_NO_THROW(ensemble->simulate(*plans)); + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + + // Existing logs should still validate + validateLogs(); +} +TEST_F(TestMPIEnsemble, error_off_rank_1) { + if (world_size == 1) { + GTEST_SKIP() << "world_size==1, test not applicable."; + } + model->newLayer().addHostFunction(throw_exception_rank_1); + initEnsemble(); + ensemble->Config().error_level = CUDAEnsemble::EnsembleConfig::Off; + unsigned int err_count = 0; + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + EXPECT_NO_THROW(err_count = ensemble->simulate(*plans)); + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + // With error off, we would expect to see run index 10 fail + // Therefore 1 returned instead of 0 + EXPECT_EQ(err_count, 2u); + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d + EXPECT_TRUE(output.find("CUDAEnsemble completed") != std::string::npos); // E.g. CUDAEnsemble completed 2 runs successfully! + EXPECT_TRUE(errors.find("Warning: Run ") != std::string::npos); // E.g. Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + EXPECT_TRUE(errors.find(" failed on rank ") != std::string::npos); // E.g. 
Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + } else { + // Capture stderr and stdout + // Only rank 0 returns the error count + EXPECT_EQ(err_count, 0u); + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + // Existing logs should still validate + validateLogs(); +} +TEST_F(TestMPIEnsemble, error_slow_rank_1) { + if (world_size == 1) { + GTEST_SKIP() << "world_size==1, test not applicable."; + } + model->newLayer().addHostFunction(throw_exception_rank_1); + initEnsemble(); + ensemble->Config().error_level = CUDAEnsemble::EnsembleConfig::Slow; + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + EXPECT_THROW(ensemble->simulate(*plans), flamegpu::exception::EnsembleError); + // @todo can't capture total number of successful/failed runs + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d + EXPECT_TRUE(output.find("CUDAEnsemble completed") != std::string::npos); // E.g. CUDAEnsemble completed 2 runs successfully! + EXPECT_TRUE(errors.find("Warning: Run ") != std::string::npos); // E.g. Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + EXPECT_TRUE(errors.find(" failed on rank ") != std::string::npos); // E.g. Warning: Run 10/10 failed on rank 0, device 0, thread 0 with exception: + } else { + // Only rank 0 raises exception + EXPECT_NO_THROW(ensemble->simulate(*plans)); + // @todo can't capture total number of successful/failed runs + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + // Existing logs should still validate + validateLogs(); +} +TEST_F(TestMPIEnsemble, error_fast_rank_1) { + if (world_size == 1) { + GTEST_SKIP() << "world_size==1, test not applicable."; + } + model->newLayer().addHostFunction(throw_exception_rank_1); + initEnsemble(); + ensemble->Config().error_level = CUDAEnsemble::EnsembleConfig::Fast; + // Capture stderr and stdout + testing::internal::CaptureStdout(); + testing::internal::CaptureStderr(); + // Expect no warnings (stderr) but outputs on progress and timing + if (world_rank == 0) { + EXPECT_THROW(ensemble->simulate(*plans), flamegpu::exception::EnsembleError); + // @todo can't capture total number of successful/failed runs + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.find("MPI ensemble assigned run ") != std::string::npos); // E.g. MPI ensemble assigned run %d/%u to rank %d +#ifdef _DEBUG + EXPECT_TRUE(errors.find("Run ") != std::string::npos); // E.g. 
Run 5 failed on rank 1, device 1, thread 0 with exception: + EXPECT_TRUE(errors.find(" failed on rank ") != std::string::npos); // E.g. Run 5 failed on rank 1, device 1, thread 0 with exception: +#else + EXPECT_TRUE(errors.empty() || errors.find("Warning: MPI Ensemble launched with") != std::string::npos); // can't fully match the error message as it contains mpirun/machine specific numbers +#endif + } else { + // Only rank 0 raises exception and captures total number of successful/failed runs + EXPECT_NO_THROW(ensemble->simulate(*plans)); + // Get stderr and stdout + const std::string output = testing::internal::GetCapturedStdout(); + const std::string errors = testing::internal::GetCapturedStderr(); + // printf("[%d]output:\n:%s\n", world_rank, output.c_str()); + // printf("[%d]errors:\n:%s\n", world_rank, errors.c_str()); + EXPECT_TRUE(output.empty()); + EXPECT_TRUE(errors.empty()); + } + + // Existing logs should still validate + validateLogs(); +} +// This test doesn't want to use the fixture, so must use a different test suite name +TEST(TestMPIEnsembleNoFixture, devicesForThisRank) { + // Check that the static, testable version of devicesForThisRank assigns the correct number of devices to the "current" rank, faking the MPI rank and size to make this testable regardless of mpirun config. + + // with 1 local mpi rank, the full set of devices should be used regardless of count + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0}, 1, 0), std::set({0})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1}, 1, 0), std::set({0, 1})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({1, 2, 3}, 1, 0), std::set({1, 2, 3})); + + // with the same number of ranks as gpus, each rank should get their respective gpu + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 4, 0), std::set({0})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 4, 1), std::set({1})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 4, 2), std::set({2})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 4, 3), std::set({3})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({2, 3}, 2, 0), std::set({2})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({2, 3}, 2, 1), std::set({3})); + + // With fewer ranks than gpus, gpus should be load balanced, with lower ranks having extra gpus if the number is not divisible + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 2, 0), std::set({0, 1})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 2, 1), std::set({2, 3})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2}, 2, 0), std::set({0, 1})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2}, 2, 1), std::set({2})); + + // with more ranks than gpus, each rank should get 1 or 0 gpus. 
+ EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0}, 3, 0), std::set({0})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0}, 3, 1), std::set({})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0}, 3, 2), std::set({})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 6, 0), std::set({0})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 6, 1), std::set({1})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 6, 2), std::set({2})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 6, 3), std::set({3})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 6, 4), std::set({})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({0, 1, 2, 3}, 6, 5), std::set({})); + + // std::set is sorted, not ordered, so the selected device will use lower device ids for lower ranks + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({3, 2, 1}, 3, 0), std::set({1})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({3, 2, 1}, 3, 1), std::set({2})); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({3, 2, 1}, 3, 2), std::set({3})); + + // no devices, set should be empty regardless of mpi size / rank + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({}, 1, 0), std::set()); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({}, 2, 0), std::set()); + EXPECT_EQ(flamegpu::detail::MPIEnsemble::devicesForThisRank({}, 2, 1), std::set()); +} +#else +TEST(TestMPIEnsemble, DISABLED_success) { } +TEST(TestMPIEnsemble, DISABLED_success_verbose) { } +TEST(TestMPIEnsemble, DISABLED_error_off_rank_0) { } +TEST(TestMPIEnsemble, DISABLED_error_slow_rank_0) { } +TEST(TestMPIEnsemble, DISABLED_error_fast_rank_0) { } +TEST(TestMPIEnsemble, DISABLED_error_off_rank_1) { } +TEST(TestMPIEnsemble, DISABLED_error_slow_rank_1) { } +TEST(TestMPIEnsemble, DISABLED_error_fast_rank_1) { } +TEST(TestMPIEnsembleNoFixture, DISABLED_devicesForThisRank) { } +#endif + +} // namespace test_mpi_ensemble +} // namespace tests +} // namespace flamegpu diff --git a/tests/test_cases/util/test_cleanup.cu b/tests/test_cases/util/test_cleanup.cu index 0bbe42c8c..a9c8580eb 100644 --- a/tests/test_cases/util/test_cleanup.cu +++ b/tests/test_cases/util/test_cleanup.cu @@ -26,7 +26,7 @@ FLAMEGPU_AGENT_FUNCTION(alive, MessageNone, MessageNone) { // Test the getting of a device's compute capability. TEST(TestCleanup, Explicit) { - // Allocate some arbitraty device memory. + // Allocate some arbitrary device memory. int * d_int = nullptr; gpuErrchk(cudaMalloc(&d_int, sizeof(int))); // Validate that the ptr is a valid device pointer
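For readers following the devicesForThisRank expectations above, the behaviour they encode is a contiguous, balanced split of the sorted device ids across the local MPI ranks, with lower ranks taking any remainder. The sketch below is an illustrative reimplementation written for this document only, assuming int device ids; it is not FLAMEGPU's actual MPIEnsemble code, and the name devicesForThisRankSketch is hypothetical.

#include <algorithm>
#include <set>
#include <vector>

// Sketch only: split the sorted device ids into contiguous chunks, one chunk per local rank.
std::set<int> devicesForThisRankSketch(const std::set<int> &devices, int local_size, int local_rank) {
    // std::set iterates in ascending order, so lower ranks receive lower device ids.
    const std::vector<int> sorted(devices.begin(), devices.end());
    const int n = static_cast<int>(sorted.size());
    const int base = n / local_size;        // every rank receives at least this many devices
    const int remainder = n % local_size;   // the first 'remainder' ranks receive one extra device
    const int begin = local_rank * base + std::min(local_rank, remainder);
    const int count = base + (local_rank < remainder ? 1 : 0);
    return std::set<int>(sorted.begin() + begin, sorted.begin() + begin + count);
}

Worked against the expectations in the test: {0, 1, 2, 3} over 2 ranks gives {0, 1} and {2, 3}; {0, 1, 2} over 2 ranks gives the extra device to rank 0 ({0, 1} and {2}); and with more ranks than devices the trailing ranks receive an empty set.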