Merge pull request #151 from atillack/cuda_optimization

Cuda optimizations and other loose ends
ccsb-scripps · Sep 1, 2021 · 0fe820f · 0fe820f
2 parents 99f602c + f8fd68f
commit 0fe820f
Show file tree

Hide file tree

Showing 10 changed files with 25 additions and 139 deletions.
diff --git a/cuda/auxiliary_genetic.cu b/cuda/auxiliary_genetic.cu
@@ -77,82 +77,3 @@ inline __device__ void map_angle(float& angle)
 	}
 }
 
-#if 0
-// -------------------------------------------------------
-//
-// -------------------------------------------------------
-void gpu_perform_elitist_selection(
-                                         int    dockpars_pop_size,
-                          __global       float* restrict dockpars_energies_current,
-                          __global       float* restrict dockpars_energies_next,
-                          __global       int*   restrict dockpars_evals_of_new_entities,
-                                         int    dockpars_num_of_genes,
-                          __global       float* restrict dockpars_conformations_next,
-                          __global const float* restrict dockpars_conformations_current,
-                                 __local float* best_energies,
-                                 __local int*   best_IDs,
-                                 __local int*   best_ID
-                                  )
-// The GPU device function performs elitist selection,
-// that is, it looks for the best entity in conformations_current and
-// energies_current of the run that corresponds to the block ID,
-// and copies it to the place of the first entity in
-// conformations_next and energies_next.
-{
-	int entity_counter;
-	int gene_counter;
-	float best_energy;
-
-	if (get_local_id(0) < dockpars_pop_size) {
-		best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+get_local_id(0)];
-		best_IDs[get_local_id(0)] = get_local_id(0);
-	}
-
-	for (entity_counter = NUM_OF_THREADS_PER_BLOCK+get_local_id(0);
-	     entity_counter < dockpars_pop_size;
-	     entity_counter+= NUM_OF_THREADS_PER_BLOCK)
-	{
-		if (dockpars_energies_current[get_group_id(0)+entity_counter] < best_energies[get_local_id(0)]) {
-			best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+entity_counter];
-			best_IDs[get_local_id(0)] = entity_counter;
-		}
-	}
-
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	// This could be implemented with a tree-like structure
-	// which may be slightly faster
-	if (get_local_id(0) == 0)
-	{
-		best_energy = best_energies[0];
-		best_ID[0] = best_IDs[0];
-
-		for (entity_counter = 1;
-		     entity_counter < NUM_OF_THREADS_PER_BLOCK;
-		     entity_counter++)
-		{
-			if ((best_energies[entity_counter] < best_energy) && (entity_counter < dockpars_pop_size)) {
-				best_energy = best_energies[entity_counter];
-				best_ID[0] = best_IDs[entity_counter];
-			}
-		}
-
-		// Setting energy value of new entity
-		dockpars_energies_next[get_group_id(0)] = best_energy;
-
-		// Zero (0) evals were performed for entity selected with elitism (since it was copied only)
-		dockpars_evals_of_new_entities[get_group_id(0)] = 0;
-	}
-
-	// "best_id" stores the id of the best entity in the population,
-	// Copying genotype and energy value to the first entity of new population
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	for (gene_counter = get_local_id(0);
-	     gene_counter < dockpars_num_of_genes;
-	     gene_counter+= NUM_OF_THREADS_PER_BLOCK)
-	{
-		dockpars_conformations_next[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+gene_counter] = dockpars_conformations_current[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0) + GENOTYPE_LENGTH_IN_GLOBMEM*best_ID[0]+gene_counter];
-	}
-}
-#endif
diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu
@@ -119,7 +119,6 @@ __device__ void gpu_calc_energrad(
 	uint32_t  g2 = cData.dockpars.gridsize_x_times_y;
 	uint32_t  g3 = cData.dockpars.gridsize_x_times_y_times_z;
 
-	__threadfence();
 	__syncthreads();
 
 	// ================================================
@@ -183,8 +182,7 @@ __device__ void gpu_calc_energrad(
 			calc_coords[atom_id].z = qt.z + rotation_movingvec.z;
 
 		} // End if-statement not dummy rotation
-		__threadfence();
-		__syncthreads();
+			__syncthreads();
 	} // End rotation_counter for-loop
 
 	// ================================================
@@ -409,7 +407,6 @@ __device__ void gpu_calc_energrad(
 		gradient[atom_id].z += lrintf(fminf(MAXTERM, fmaxf(-MAXTERM, TERMSCALE * gz)));
 #endif
 	} // End atom_id for-loop (INTERMOLECULAR ENERGY)
-	__threadfence();
 	__syncthreads();
 
 	// Inter- and intra-molecular energy calculation
@@ -595,7 +592,6 @@ __device__ void gpu_calc_energrad(
 		ATOMICADDI32(&gradient[atom2_id].z, priv_intra_gradient_z);
 #endif
 	} // End contributor_counter for-loop (INTRAMOLECULAR ENERGY)
-	__threadfence();
 	__syncthreads();
 
 	// Transform gradients_inter_{x|y|z} 
@@ -681,7 +677,6 @@ __device__ void gpu_calc_energrad(
 		printf("gradient_z:%f\n", gradient_genotype [2]);
 		#endif
 	}
-	__threadfence();
 	__syncthreads();
 
 	// ------------------------------------------
@@ -876,7 +871,6 @@ __device__ void gpu_calc_energrad(
 		printf("%-13.6f %-13.6f %-13.6f\n", gradient_genotype[3], gradient_genotype[4], gradient_genotype[5]);
 		#endif
 	}
-	__threadfence();
 	__syncthreads();
 
 	// ------------------------------------------
@@ -938,15 +932,13 @@ __device__ void gpu_calc_energrad(
 		// - this works because a * (a_1 + a_2 + ... + a_n) = a*a_1 + a*a_2 + ... + a*a_n
 		ATOMICADDI32(&gradient_genotype[rotbond_id+6], lrintf(fminf(MAXTERM, fmaxf(-MAXTERM, TERMSCALE * torque_on_axis * DEG_TO_RAD)))); /*(M_PI / 180.0f)*/;
 	}
-	__threadfence();
 	__syncthreads();
 
 	for (uint32_t gene_cnt = threadIdx.x;
 	              gene_cnt < cData.dockpars.num_of_genes;
 	              gene_cnt+= blockDim.x) {
 		fgradient_genotype[gene_cnt] = ONEOVERTERMSCALE * (float)gradient_genotype[gene_cnt];
 	}
-	__threadfence();
 	__syncthreads();
 
 	#if defined (CONVERT_INTO_ANGSTROM_RADIAN)
@@ -956,7 +948,6 @@ __device__ void gpu_calc_energrad(
 	{
 		fgradient_genotype[gene_cnt] *= cData.dockpars.grid_spacing * cData.dockpars.grid_spacing * SCFACTOR_ANGSTROM_RADIAN;
 	}
-	__threadfence();
 	__syncthreads();
 	#endif
 }
diff --git a/cuda/calcenergy.cu b/cuda/calcenergy.cu
@@ -163,7 +163,6 @@ __device__ void gpu_calc_energy(
 	uint g2 = cData.dockpars.gridsize_x_times_y;
 	uint g3 = cData.dockpars.gridsize_x_times_y_times_z;
 
-	__threadfence();
 	__syncthreads();
 
 	// ================================================
@@ -226,8 +225,7 @@ __device__ void gpu_calc_energy(
 			calc_coords[atom_id].z = qt.z + rotation_movingvec.z;
 		} // End if-statement not dummy rotation
 
-		__threadfence();
-		__syncthreads();
+			__syncthreads();
 
 	} // End rotation_counter for-loop
 

diff --git a/cuda/kernel3.cu b/cuda/kernel3.cu
@@ -51,11 +51,12 @@ gpu_perform_LS_kernel(
 	__shared__ int   iteration_cnt;
 	__shared__ int   evaluation_cnt;
 
-
 	__shared__ float offspring_energy;
 	__shared__ float sFloatAccumulator;
 	__shared__ int entity_id;
-	extern __shared__ float sFloatBuff[];
+
+	volatile __shared__ float sFloatBuff[3 * MAX_NUM_OF_ATOMS + 4 * ACTUAL_GENOTYPE_LENGTH];
+
 	float candidate_energy;
 	int run_id;
 	// Ligand-atom position and partial energies
@@ -90,7 +91,6 @@ gpu_perform_LS_kernel(
 		iteration_cnt = 0;
 		evaluation_cnt = 0;
 	}
-	__threadfence();
 	__syncthreads();
 
 	size_t offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM;
@@ -101,7 +101,6 @@ gpu_perform_LS_kernel(
 		offspring_genotype[gene_counter] = pMem_conformations_next[offset + gene_counter];
 		genotype_bias[gene_counter] = 0.0f;
 	}
-	__threadfence();
 	__syncthreads();
 
 
@@ -155,7 +154,6 @@ gpu_perform_LS_kernel(
 			                                   genotype_bias[gene_counter];
 		}
 		// Evaluating candidate
-		__threadfence();
 		__syncthreads();
 
 		// =================================================================
@@ -170,7 +168,6 @@ gpu_perform_LS_kernel(
 		if (threadIdx.x == 0) {
 			evaluation_cnt++;
 		}
-		__threadfence();
 		__syncthreads();
 
 		if (candidate_energy < offspring_energy)	// If candidate is better, success
@@ -187,7 +184,6 @@ gpu_perform_LS_kernel(
 
 			// Work-item 0 will overwrite the shared variables
 			// used in the previous if condition
-			__threadfence();
 			__syncthreads();
 
 			if (threadIdx.x == 0)
@@ -210,7 +206,6 @@ gpu_perform_LS_kernel(
 			}
 
 			// Evaluating candidate
-			__threadfence();
 			__syncthreads();
 
 			// =================================================================
@@ -230,7 +225,6 @@ gpu_perform_LS_kernel(
 				printf("%-18s [%-5s]---{%-5s}   [%-10.8f]---{%-10.8f}\n", "-ENERGY-KERNEL3-", "GRIDS", "INTRA", partial_interE[0], partial_intraE[0]);
 				#endif
 			}
-			__threadfence();
 			__syncthreads();
 
 			if (candidate_energy < offspring_energy) // If candidate is better, success
@@ -247,8 +241,7 @@ gpu_perform_LS_kernel(
 
 				// Work-item 0 will overwrite the shared variables
 				// used in the previous if condition
-				__threadfence();
-				__syncthreads();
+							__syncthreads();
 
 				if (threadIdx.x == 0)
 				{
@@ -290,7 +283,6 @@ gpu_perform_LS_kernel(
 					cons_fail = 0;
 				}
 		}
-		__threadfence();
 		__syncthreads();
 	}
 
@@ -321,8 +313,7 @@ void gpu_perform_LS(
                     float*   pMem_energies_next
                    )
 {
-	size_t sz_shared = (3 * MAX_NUM_OF_ATOMS + 4 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float);
-	gpu_perform_LS_kernel<<<blocks, threads, sz_shared>>>(pMem_conformations_next, pMem_energies_next);
+	gpu_perform_LS_kernel<<<blocks, threads>>>(pMem_conformations_next, pMem_energies_next);
 	LAUNCHERROR("gpu_perform_LS_kernel");
 #if 0
 	cudaError_t status;

diff --git a/cuda/kernel_ad.cu b/cuda/kernel_ad.cu
@@ -84,7 +84,8 @@ gpu_gradient_minAD_kernel(
 	__shared__ int entity_id;
 	__shared__ float best_energy;
 	__shared__ float sFloatAccumulator;
-	extern __shared__ float sFloatBuff[];
+
+	volatile __shared__ float sFloatBuff[6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH];
 
 	// Ligand-atom position and partial energies
 	float3* calc_coords = (float3*)sFloatBuff;
@@ -135,7 +136,6 @@ gpu_gradient_minAD_kernel(
 		printf("%20s %.6f\n", "initial energy: ", energy);
 		#endif
 	}
-	__threadfence();
 	__syncthreads();
 	energy = pMem_energies_next[run_id * cData.dockpars.pop_size + entity_id];
 
@@ -160,7 +160,6 @@ gpu_gradient_minAD_kernel(
 	// E.g. in steepest descent "delta" is -1.0 * stepsize * gradient
 
 	// Asynchronous copy should be finished by here
-	__threadfence();
 	__syncthreads();
 
 	// Initializing vectors
@@ -214,7 +213,6 @@ gpu_gradient_minAD_kernel(
 		// =============================================================
 		// =============================================================
 		// Calculating energy & gradient
-		__threadfence();
 		__syncthreads();
 
 		gpu_calc_energrad(
@@ -272,7 +270,6 @@ gpu_gradient_minAD_kernel(
 			printf("\n");
 			#endif
 		}
-		__threadfence();
 		__syncthreads();
 		#endif // DEBUG_ENERGY_ADADELTA
 
@@ -297,7 +294,6 @@ gpu_gradient_minAD_kernel(
 			// Applying update
 			genotype[i] += delta;
 		}
-		__threadfence();
 		__syncthreads();
 
 		#if defined (DEBUG_SQDELTA_ADADELTA)
@@ -310,7 +306,6 @@ gpu_gradient_minAD_kernel(
 				printf("%13u %20.6f %15.6f %15.6f %15.6f\n", i, square_gradient[i], delta[i], square_delta[i], genotype[i]);
 			}
 		}
-		__threadfence();
 		__syncthreads();
 		#endif
 
@@ -356,8 +351,7 @@ gpu_gradient_minAD_kernel(
 			}
 #endif
 		}
-		__threadfence();
-		__syncthreads(); // making sure that iteration_cnt is up-to-date
+			__syncthreads(); // making sure that iteration_cnt is up-to-date
 #ifdef ADADELTA_AUTOSTOP
 	} while ((iteration_cnt < cData.dockpars.max_num_of_iters)  && (rho > 0.01f));
 #else
@@ -375,7 +369,6 @@ gpu_gradient_minAD_kernel(
 	}
 
 	// Updating old offspring in population
-	__threadfence();
 	__syncthreads();
 
 	offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM;
@@ -407,8 +400,7 @@ void gpu_gradient_minAD(
                         float*   pMem_energies_next
                        )
 {
-	size_t sz_shared = (6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float);
-	gpu_gradient_minAD_kernel<<<blocks, threads, sz_shared>>>(pMem_conformations_next, pMem_energies_next);
+	gpu_gradient_minAD_kernel<<<blocks, threads>>>(pMem_conformations_next, pMem_energies_next);
 	LAUNCHERROR("gpu_gradient_minAD_kernel");
 #if 0
 	cudaError_t status;