diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 34ee6fadb04de..e784b53b7ce1f 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -4,7 +4,7 @@
 #
 # for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex
 #
-#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
-from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
+from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
+#from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
 recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
 recopixelvertexing = cms.Sequence(recopixelvertexingTask)
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index 77a9f367b9d9b..ea9e4b1e4e037 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -20,3 +20,6 @@
 )
 
 
+from Configuration.ProcessModifiers.gpu_cff import gpu
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexHeterogeneousProducer_cfi import pixelVertexHeterogeneousProducer as _pixelVertexHeterogeneousProducer
+gpu.toReplaceWith(pixelVertices, _pixelVertexHeterogeneousProducer)
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index ad1f03999fbea..dc3b98f8456a5 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -21,5 +21,6 @@
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinder_t">
   <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
   <flags CXXFLAGS="-g"/>
 </bin>
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
index a92c116702231..d1f508ca98798 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -4,6 +4,11 @@
 #include<cmath>
 
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
+
+
 using namespace  gpuVertexFinder;
 #include <cuda/api_wrappers.h>
 
@@ -81,7 +86,8 @@ int main() {
   }
 
   auto current_device = cuda::device::current::get();
-
+  
+  auto ntrks_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
   auto zt_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
   auto ezt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
   auto ptt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
@@ -96,11 +102,13 @@ int main() {
   auto iv_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
 
   auto nv_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
+  auto nv2_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
  
   auto onGPU_d = cuda::memory::device::make_unique<OnGPU[]>(current_device, 1);
 
   OnGPU onGPU;
 
+  onGPU.ntrks = ntrks_d.get();
   onGPU.zt = zt_d.get();
   onGPU.ezt2 = ezt2_d.get();
   onGPU.ptt2 = ptt2_d.get();
@@ -109,7 +117,8 @@ int main() {
   onGPU.chi2 = chi2_d.get();
   onGPU.ptv2 = ptv2_d.get();
   onGPU.sortInd = ind_d.get();
-  onGPU.nv = nv_d.get();
+  onGPU.nvFinal = nv_d.get();
+  onGPU.nvIntermediate = nv2_d.get();
   onGPU.izt = izt_d.get();
   onGPU.nn = nn_d.get();
   onGPU.iv = iv_d.get();
@@ -131,7 +140,8 @@ int main() {
   gen(ev);
   
   std::cout << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
-
+  auto nt = ev.ztrack.size();
+  cuda::memory::copy(onGPU.ntrks,&nt,sizeof(uint32_t));
   cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
   cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
   cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
@@ -143,51 +153,101 @@ int main() {
   if ( (i%4) == 0 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 onGPU_d.get(),kk,eps,
 		 0.02f,12.0f
 		 );
   
   if ( (i%4) == 1 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 onGPU_d.get(),kk,eps,
 		 0.02f,9.0f
 		 );
   
   if ( (i%4) == 2 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 onGPU_d.get(),kk,eps,
 		 0.01f,9.0f
 		 );
   
   if ( (i%4) == 3 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps,
+		 onGPU_d.get(),kk,0.7f*eps,
 		 0.01f,9.0f
 		 );
-  
+  cudaCheck(cudaGetLastError());
   cudaDeviceSynchronize();
+
+  cuda::launch(fitVertices, 
+               { 1,1024-256 },
+               onGPU_d.get(),50.f
+              );
+  cudaCheck(cudaGetLastError());
+
+  uint32_t nv;
+  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
+  if (nv==0) {
+    std::cout << "NO VERTICES???" << std::endl;
+    continue;
+  }
+  float chi2[2*nv];  // make space for splitting...
+  float zv[2*nv];
+  float wv[2*nv];
+  float ptv2[2*nv];
+  int32_t nn[2*nv];
+  uint16_t ind[2*nv];
+
+  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
+  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "after fit min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+
+  cuda::launch(fitVertices,
+               { 1,1024-256 },
+               onGPU_d.get(), 50.f
+              );
+  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
+  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
+  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "before splitting min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+
+  cuda::launch(splitVertices,
+               { 1024, 64 },
+               onGPU_d.get(),
+               9.f
+              );
+ cuda::memory::copy(&nv, onGPU.nvIntermediate, sizeof(uint32_t));
+ std::cout << "after split " << nv << std::endl; 
+
+  cuda::launch(fitVertices,
+               { 1,1024-256 },
+               onGPU_d.get(),5000.f
+              );
+  cudaCheck(cudaGetLastError());
+
+
   cuda::launch(sortByPt2,
                { 1, 256 },
-               ev.ztrack.size(), onGPU_d.get()
+               onGPU_d.get()
               );
 
-  uint32_t nv;
-  cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t));
+  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
 
   if (nv==0) {
     std::cout << "NO VERTICES???" << std::endl;
     continue;
   }
 
-  float zv[nv];
-  float	wv[nv];
-  float	chi2[nv];
-  float ptv2[nv];
-  int32_t nn[nv];
-  uint16_t ind[nv];
+
   cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float));
   cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float));
   cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
@@ -195,15 +255,16 @@ int main() {
   cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
   cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t));
   for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]); 
-   
   {
-    auto mx = std::minmax_element(wv,wv+nv);
-    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
   }
+
   {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "min max chi2 " << *mx.first << ' ' <<  *mx.second << std::endl;
+    auto mx = std::minmax_element(wv,wv+nv);
+    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
   }
+
   {
     auto mx = std::minmax_element(ptv2,ptv2+nv);
     std::cout << "min max ptv2 " << *mx.first << ' ' <<  *mx.second << std::endl;
@@ -212,16 +273,15 @@ int main() {
   }  
 
   float dd[nv];
-  uint32_t ii=0;
-  for (auto zr : zv) {
+  for (auto kv=0U; kv<nv; ++kv) {
+   auto zr = zv[kv];
    auto md=500.0f;
    for (auto zs : ev.ztrack) { 
      auto d = std::abs(zr-zs);
      md = std::min(d,md);
    }
-   dd[ii++] = md;
+   dd[kv] = md;
   }
-  assert(ii==nv);
   if (i==6) {
     for (auto d:dd) std::cout << d << ' ';
     std::cout << std::endl;