diff --git a/.github/workflows/README.rst b/.github/workflows/README.rst
index 8d98d0981b..89e4b31d67 100644
--- a/.github/workflows/README.rst
+++ b/.github/workflows/README.rst
@@ -9,24 +9,31 @@ DevCloud Runners
 ================
 
-We have 2 self-hosted runners on devcloud. We run in tmux so it will
-continue to run after disconnecting. ``ssh`` to DevCloud::
+We have 2 self-hosted runners on devcloud. We run them in an emacs
+daemon so they continue to run after disconnecting. ``ssh`` to
+DevCloud and start an emacs daemon::
+
+  emacs --daemon
+
+Inside an emacs shell::
 
-  tmux
   cd github/runner-1
   ./run.sh
 
-For the second runner, split the window with ``control-b "`` and start
-the runner::
+Rename the shell buffer to ``devcloud-1``, then start a second emacs
+shell named ``devcloud-2`` for the other runner::
 
   cd github/runner-2
   ./run.sh
 
-To check on a runner ``ssh`` to DevCloud and do::
+If you disconnect, emacs and the runners will continue to run. To
+check on a runner, ``ssh`` to DevCloud and do::
 
-  tmux ls
+  emacsclient -nw
 
-To see the sessions. Usually there is just one name ``0``. Attach to
-the session::
+And visit the ``devcloud-1`` and ``devcloud-2`` buffers.
 
-  tmux attach -t 0
+``tmux`` is an alternative. I do not use it because devcloud allows
+only 4 logins via ``ssh``. Using ``tmux`` with 2 runners consumes 3
+of them even while detached, so you can end up unable to log in
+because ``tmux`` is holding all the logins.
 
diff --git a/include/dr/mhp/algorithms/md_for_each.hpp b/include/dr/mhp/algorithms/md_for_each.hpp
index 6928ab62f2..ccd98d52c2 100644
--- a/include/dr/mhp/algorithms/md_for_each.hpp
+++ b/include/dr/mhp/algorithms/md_for_each.hpp
@@ -142,8 +142,15 @@ void for_each(F op, is_mdspan_view auto &&...drs) {
   auto invoke_index = [=](auto index) {
     // Transform mdspans into references
     auto references = detail::tie_transform(
-        operand_mdspans, [index](auto mdspan) -> decltype(auto) {
-          return mdspan(index[0], index[1]);
+        operand_mdspans, [mdspan0, index](auto mdspan) -> decltype(auto) {
+          static_assert(1 <= mdspan0.rank() && mdspan0.rank() <= 3);
+          if constexpr (mdspan0.rank() == 1) {
+            return mdspan(index[0]);
+          } else if constexpr (mdspan0.rank() == 2) {
+            return mdspan(index[0], index[1]);
+          } else if constexpr (mdspan0.rank() == 3) {
+            return mdspan(index[0], index[1], index[2]);
+          }
         });
     static_assert(
         std::invocable<F, decltype(references)> ||
@@ -160,13 +167,20 @@ void for_each(F op, is_mdspan_view auto &&...drs) {
     }
   };
 
-  // TODO: Extend sycl_utils.hpp to handle ranges > 1D. It uses
-  // ndrange and handles > 32 bits.
-
-  dr::__detail::parallel_for(
-      mhp::sycl_queue(),
-      sycl::range(mdspan0.extent(0), mdspan0.extent(1)), invoke_index)
-      .wait();
+  if constexpr (mdspan0.rank() == 1) {
+    auto range = sycl::range(mdspan0.extent(0));
+    dr::__detail::parallel_for(mhp::sycl_queue(), range, invoke_index)
+        .wait();
+  } else if constexpr (mdspan0.rank() == 2) {
+    auto range = sycl::range(mdspan0.extent(0), mdspan0.extent(1));
+    dr::__detail::parallel_for(mhp::sycl_queue(), range, invoke_index)
+        .wait();
+  } else if constexpr (mdspan0.rank() == 3) {
+    auto range = sycl::range(mdspan0.extent(0), mdspan0.extent(1),
+                             mdspan0.extent(2));
+    dr::__detail::parallel_for(mhp::sycl_queue(), range, invoke_index)
+        .wait();
+  }
 #else
   assert(false);
 #endif
diff --git a/scripts/devcloud-test.sh b/scripts/devcloud-test.sh
index f898dfeabd..87898f39ef 100755
--- a/scripts/devcloud-test.sh
+++ b/scripts/devcloud-test.sh
@@ -26,9 +26,10 @@ echo "::group::SHP GPU Test"
 ONEAPI_DEVICE_SELECTOR=level_zero:0 time ctest --test-dir build -L SHP
 echo "::endgroup::"
 
-echo "::group::SHP CPU Test"
-ONEAPI_DEVICE_SELECTOR=opencl:cpu time ctest --test-dir build -L SHP
-echo "::endgroup::"
+# disabled: very slow or fails when a cryptominer is running on devcloud
+#echo "::group::SHP CPU Test"
+#ONEAPI_DEVICE_SELECTOR=opencl:cpu time ctest --test-dir build -L SHP
+#echo "::endgroup::"
 
 echo "::group::MHP GPU Test"
 ONEAPI_DEVICE_SELECTOR=level_zero:* time ctest --test-dir build -L MHP
diff --git a/test/gtest/mhp/mdstar.cpp b/test/gtest/mhp/mdstar.cpp
index b9dc948469..1d47b0e824 100644
--- a/test/gtest/mhp/mdstar.cpp
+++ b/test/gtest/mhp/mdstar.cpp
@@ -261,6 +261,46 @@ TEST_F(Mdarray, Enumerate) {
   static_assert(dr::distributed_range);
 }
 
+TEST_F(Mdarray, Slabs) {
+  // local_mdspan is not accessible for device memory
+  if (options.count("device-memory")) {
+    return;
+  }
+
+  // leading dimension decomp of 3d array creates slabs
+  xhp::distributed_mdarray<T, 3> mdarray(extents3d);
+  for (auto slab : dr::mhp::local_mdspans(mdarray)) {
+    for (std::size_t i = 0; i < slab.extent(0); i++) {
+      for (std::size_t j = 0; j < slab.extent(1); j++) {
+        for (std::size_t k = 0; k < slab.extent(2); k++) {
+          slab(i, j, k) = 1;
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(mdarray.mdspan()(0, 0, 0), 1);
+  EXPECT_EQ(
+      mdarray.mdspan()(extents3d[0] - 1, extents3d[1] - 1, extents3d[2] - 1),
+      1);
+}
+
+TEST_F(Mdarray, MdForEach3d) {
+  // leading dimension decomp of 3d array creates slabs
+  xhp::distributed_mdarray<T, 3> mdarray(extents3d);
+  std::vector<T> local(extents3d[0] * extents3d[1] * extents3d[2], 0);
+  rng::iota(local, 0);
+
+  auto set = [d1 = extents3d[1], d2 = extents3d[2]](auto index, auto v) {
+    auto &[o] = v;
+    o = index[0] * d1 * d2 + index[1] * d2 + index[2];
+  };
+  dr::mhp::for_each(set, mdarray);
+
+  EXPECT_EQ(xhp::views::take(mdarray.view(), local.size()), local)
+      << mdrange_message(mdarray);
+}
+
 using Submdspan = Mdspan;
 
 TEST_F(Submdspan, StaticAssert) {
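A side note on the rank dispatch added to md_for_each.hpp: the explicit rank-1/2/3 branches in the indexing lambda could, in principle, be collapsed with an index_sequence pack expansion. The sketch below is illustration only and is not part of this change or of the dr:: sources; `subscript`, `subscript_impl`, and `FakeMdspan3` are made-up names, and the mdspan is faked with a plain callable so the example is self-contained.

// Hypothetical sketch: expand idx[0], ..., idx[Rank-1] into one call
// instead of writing one branch per rank.
#include <array>
#include <cassert>
#include <cstddef>
#include <utility>

template <typename Mdspan, std::size_t Rank, std::size_t... Is>
decltype(auto) subscript_impl(Mdspan &&m,
                              const std::array<std::size_t, Rank> &idx,
                              std::index_sequence<Is...>) {
  return m(idx[Is]...); // m(idx[0], idx[1], ..., idx[Rank - 1])
}

template <typename Mdspan, std::size_t Rank>
decltype(auto) subscript(Mdspan &&m,
                         const std::array<std::size_t, Rank> &idx) {
  return subscript_impl(std::forward<Mdspan>(m), idx,
                        std::make_index_sequence<Rank>{});
}

// Stand-in for an mdspan: any callable taking one index per dimension.
struct FakeMdspan3 {
  int operator()(std::size_t i, std::size_t j, std::size_t k) const {
    return static_cast<int>(i * 100 + j * 10 + k);
  }
};

int main() {
  std::array<std::size_t, 3> idx = {1, 2, 3};
  assert(subscript(FakeMdspan3{}, idx) == 123); // same as FakeMdspan3{}(1, 2, 3)
  return 0;
}

The explicit branches used in the diff are arguably the simpler choice here: sycl::range only exists for 1 to 3 dimensions, so a fully generic expansion would not extend the supported ranks, and the constexpr chain keeps the SYCL launch code easy to read.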