diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
index 717cbf777fcdb..796241eaf50ff 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -26,7 +26,7 @@ namespace gpuPixelDoublets {
                GPUCACell * cells, uint32_t const * __restrict__ nCells,
                GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell,
                uint32_t nHits,
-               uint32_t stride, bool checkTrack) {
+               bool checkTrack) {
 
     constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;
 
@@ -35,13 +35,12 @@ namespace gpuPixelDoublets {
     uint8_t const * __restrict__ layerp =  hh.phase1TopologyLayer_d;
     auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);};
 
-    auto ldx = threadIdx.x + blockIdx.x * blockDim.x;
-    auto idx = ldx/stride;
-    auto first = ldx - idx*stride;
-    assert(first<stride);
+    // x run faster...
+    auto idy = threadIdx.y + blockIdx.y * blockDim.y;
+    auto first = threadIdx.x;
 
-    if (idx>=nHits) return;
-    auto const & vc = isOuterHitOfCell[idx];
+    if (idy>=nHits) return;
+    auto const & vc = isOuterHitOfCell[idy];
     auto s = vc.size();
     if (s<2) return;
     // if alligned kill one of the two.
@@ -66,8 +65,8 @@ namespace gpuPixelDoublets {
       ++sg;
     }
     if (sg<2) return;   
-    // here we parallelize
-    for (uint32_t ic=first; ic<sg-1;  ic+=stride) {
+    // here we parallelize 
+    for (uint32_t ic=first; ic<sg-1;  ic+=blockDim.x) {
       auto & ci = cells[cc[ic]];
       for    (auto jc=ic+1; jc<sg; ++jc) {
         auto & cj = cells[cc[jc]];
@@ -90,4 +89,4 @@ namespace gpuPixelDoublets {
 
 }
 
-#endif
+#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
index 02a175fcc2903..192efd0d8919f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -50,8 +50,11 @@ namespace gpuPixelDoublets {
     }
     auto ntot = innerLayerCumulativeSize[nPairs-1];
 
-    auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-    for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
+    // x runs faster
+    auto idy = blockIdx.y * blockDim.y + threadIdx.y;
+    auto first = threadIdx.x;
+    auto stride = blockDim.x;
+    for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) {
 
       uint32_t pairLayerId=0;
       while (j >= innerLayerCumulativeSize[pairLayerId++]);
@@ -115,7 +118,8 @@ namespace gpuPixelDoublets {
           nmin += hist.size(kk+hoff);
         auto const * __restrict__ p = hist.begin(kk+hoff);
         auto const * __restrict__ e = hist.end(kk+hoff);
-        for (;p < e; ++p) {
+        p+=first;
+        for (;p < e; p+=stride) {
           auto oi=__ldg(p);
           assert(oi>=offsets[outer]);
           assert(oi<offsets[outer+1]);
@@ -139,7 +143,7 @@ namespace gpuPixelDoublets {
     }  // loop in block...
   }
 
-  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
+  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for both x and y
   constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;
 
   __global__