Skip to content

Commit

Permalink
Merge pull request #151 from atillack/cuda_optimization
Browse files Browse the repository at this point in the history
Cuda optimizations and other loose ends
  • Loading branch information
atillack authored Sep 1, 2021
2 parents 99f602c + f8fd68f commit 0fe820f
Show file tree
Hide file tree
Showing 10 changed files with 25 additions and 139 deletions.
79 changes: 0 additions & 79 deletions cuda/auxiliary_genetic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,82 +77,3 @@ inline __device__ void map_angle(float& angle)
}
}

#if 0
// -------------------------------------------------------
//
// -------------------------------------------------------
void gpu_perform_elitist_selection(
int dockpars_pop_size,
__global float* restrict dockpars_energies_current,
__global float* restrict dockpars_energies_next,
__global int* restrict dockpars_evals_of_new_entities,
int dockpars_num_of_genes,
__global float* restrict dockpars_conformations_next,
__global const float* restrict dockpars_conformations_current,
__local float* best_energies,
__local int* best_IDs,
__local int* best_ID
)
// The GPU device function performs elitist selection,
// that is, it looks for the best entity in conformations_current and
// energies_current of the run that corresponds to the block ID,
// and copies it to the place of the first entity in
// conformations_next and energies_next.
{
int entity_counter;
int gene_counter;
float best_energy;

if (get_local_id(0) < dockpars_pop_size) {
best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+get_local_id(0)];
best_IDs[get_local_id(0)] = get_local_id(0);
}

for (entity_counter = NUM_OF_THREADS_PER_BLOCK+get_local_id(0);
entity_counter < dockpars_pop_size;
entity_counter+= NUM_OF_THREADS_PER_BLOCK)
{
if (dockpars_energies_current[get_group_id(0)+entity_counter] < best_energies[get_local_id(0)]) {
best_energies[get_local_id(0)] = dockpars_energies_current[get_group_id(0)+entity_counter];
best_IDs[get_local_id(0)] = entity_counter;
}
}

barrier(CLK_LOCAL_MEM_FENCE);

// This could be implemented with a tree-like structure
// which may be slightly faster
if (get_local_id(0) == 0)
{
best_energy = best_energies[0];
best_ID[0] = best_IDs[0];

for (entity_counter = 1;
entity_counter < NUM_OF_THREADS_PER_BLOCK;
entity_counter++)
{
if ((best_energies[entity_counter] < best_energy) && (entity_counter < dockpars_pop_size)) {
best_energy = best_energies[entity_counter];
best_ID[0] = best_IDs[entity_counter];
}
}

// Setting energy value of new entity
dockpars_energies_next[get_group_id(0)] = best_energy;

// Zero (0) evals were performed for entity selected with elitism (since it was copied only)
dockpars_evals_of_new_entities[get_group_id(0)] = 0;
}

// "best_id" stores the id of the best entity in the population,
// Copying genotype and energy value to the first entity of new population
barrier(CLK_LOCAL_MEM_FENCE);

for (gene_counter = get_local_id(0);
gene_counter < dockpars_num_of_genes;
gene_counter+= NUM_OF_THREADS_PER_BLOCK)
{
dockpars_conformations_next[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0)+gene_counter] = dockpars_conformations_current[GENOTYPE_LENGTH_IN_GLOBMEM*get_group_id(0) + GENOTYPE_LENGTH_IN_GLOBMEM*best_ID[0]+gene_counter];
}
}
#endif
11 changes: 1 addition & 10 deletions cuda/calcMergeEneGra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ __device__ void gpu_calc_energrad(
uint32_t g2 = cData.dockpars.gridsize_x_times_y;
uint32_t g3 = cData.dockpars.gridsize_x_times_y_times_z;

__threadfence();
__syncthreads();

// ================================================
Expand Down Expand Up @@ -183,8 +182,7 @@ __device__ void gpu_calc_energrad(
calc_coords[atom_id].z = qt.z + rotation_movingvec.z;

} // End if-statement not dummy rotation
__threadfence();
__syncthreads();
__syncthreads();
} // End rotation_counter for-loop

// ================================================
Expand Down Expand Up @@ -409,7 +407,6 @@ __device__ void gpu_calc_energrad(
gradient[atom_id].z += lrintf(fminf(MAXTERM, fmaxf(-MAXTERM, TERMSCALE * gz)));
#endif
} // End atom_id for-loop (INTERMOLECULAR ENERGY)
__threadfence();
__syncthreads();

// Inter- and intra-molecular energy calculation
Expand Down Expand Up @@ -595,7 +592,6 @@ __device__ void gpu_calc_energrad(
ATOMICADDI32(&gradient[atom2_id].z, priv_intra_gradient_z);
#endif
} // End contributor_counter for-loop (INTRAMOLECULAR ENERGY)
__threadfence();
__syncthreads();

// Transform gradients_inter_{x|y|z}
Expand Down Expand Up @@ -681,7 +677,6 @@ __device__ void gpu_calc_energrad(
printf("gradient_z:%f\n", gradient_genotype [2]);
#endif
}
__threadfence();
__syncthreads();

// ------------------------------------------
Expand Down Expand Up @@ -876,7 +871,6 @@ __device__ void gpu_calc_energrad(
printf("%-13.6f %-13.6f %-13.6f\n", gradient_genotype[3], gradient_genotype[4], gradient_genotype[5]);
#endif
}
__threadfence();
__syncthreads();

// ------------------------------------------
Expand Down Expand Up @@ -938,15 +932,13 @@ __device__ void gpu_calc_energrad(
// - this works because a * (a_1 + a_2 + ... + a_n) = a*a_1 + a*a_2 + ... + a*a_n
ATOMICADDI32(&gradient_genotype[rotbond_id+6], lrintf(fminf(MAXTERM, fmaxf(-MAXTERM, TERMSCALE * torque_on_axis * DEG_TO_RAD)))); /*(M_PI / 180.0f)*/;
}
__threadfence();
__syncthreads();

for (uint32_t gene_cnt = threadIdx.x;
gene_cnt < cData.dockpars.num_of_genes;
gene_cnt+= blockDim.x) {
fgradient_genotype[gene_cnt] = ONEOVERTERMSCALE * (float)gradient_genotype[gene_cnt];
}
__threadfence();
__syncthreads();

#if defined (CONVERT_INTO_ANGSTROM_RADIAN)
Expand All @@ -956,7 +948,6 @@ __device__ void gpu_calc_energrad(
{
fgradient_genotype[gene_cnt] *= cData.dockpars.grid_spacing * cData.dockpars.grid_spacing * SCFACTOR_ANGSTROM_RADIAN;
}
__threadfence();
__syncthreads();
#endif
}
4 changes: 1 addition & 3 deletions cuda/calcenergy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ __device__ void gpu_calc_energy(
uint g2 = cData.dockpars.gridsize_x_times_y;
uint g3 = cData.dockpars.gridsize_x_times_y_times_z;

__threadfence();
__syncthreads();

// ================================================
Expand Down Expand Up @@ -226,8 +225,7 @@ __device__ void gpu_calc_energy(
calc_coords[atom_id].z = qt.z + rotation_movingvec.z;
} // End if-statement not dummy rotation

__threadfence();
__syncthreads();
__syncthreads();

} // End rotation_counter for-loop

Expand Down
19 changes: 5 additions & 14 deletions cuda/kernel3.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,12 @@ gpu_perform_LS_kernel(
__shared__ int iteration_cnt;
__shared__ int evaluation_cnt;


__shared__ float offspring_energy;
__shared__ float sFloatAccumulator;
__shared__ int entity_id;
extern __shared__ float sFloatBuff[];

volatile __shared__ float sFloatBuff[3 * MAX_NUM_OF_ATOMS + 4 * ACTUAL_GENOTYPE_LENGTH];

float candidate_energy;
int run_id;
// Ligand-atom position and partial energies
Expand Down Expand Up @@ -90,7 +91,6 @@ gpu_perform_LS_kernel(
iteration_cnt = 0;
evaluation_cnt = 0;
}
__threadfence();
__syncthreads();

size_t offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM;
Expand All @@ -101,7 +101,6 @@ gpu_perform_LS_kernel(
offspring_genotype[gene_counter] = pMem_conformations_next[offset + gene_counter];
genotype_bias[gene_counter] = 0.0f;
}
__threadfence();
__syncthreads();


Expand Down Expand Up @@ -155,7 +154,6 @@ gpu_perform_LS_kernel(
genotype_bias[gene_counter];
}
// Evaluating candidate
__threadfence();
__syncthreads();

// =================================================================
Expand All @@ -170,7 +168,6 @@ gpu_perform_LS_kernel(
if (threadIdx.x == 0) {
evaluation_cnt++;
}
__threadfence();
__syncthreads();

if (candidate_energy < offspring_energy) // If candidate is better, success
Expand All @@ -187,7 +184,6 @@ gpu_perform_LS_kernel(

// Work-item 0 will overwrite the shared variables
// used in the previous if condition
__threadfence();
__syncthreads();

if (threadIdx.x == 0)
Expand All @@ -210,7 +206,6 @@ gpu_perform_LS_kernel(
}

// Evaluating candidate
__threadfence();
__syncthreads();

// =================================================================
Expand All @@ -230,7 +225,6 @@ gpu_perform_LS_kernel(
printf("%-18s [%-5s]---{%-5s} [%-10.8f]---{%-10.8f}\n", "-ENERGY-KERNEL3-", "GRIDS", "INTRA", partial_interE[0], partial_intraE[0]);
#endif
}
__threadfence();
__syncthreads();

if (candidate_energy < offspring_energy) // If candidate is better, success
Expand All @@ -247,8 +241,7 @@ gpu_perform_LS_kernel(

// Work-item 0 will overwrite the shared variables
// used in the previous if condition
__threadfence();
__syncthreads();
__syncthreads();

if (threadIdx.x == 0)
{
Expand Down Expand Up @@ -290,7 +283,6 @@ gpu_perform_LS_kernel(
cons_fail = 0;
}
}
__threadfence();
__syncthreads();
}

Expand Down Expand Up @@ -321,8 +313,7 @@ void gpu_perform_LS(
float* pMem_energies_next
)
{
size_t sz_shared = (3 * MAX_NUM_OF_ATOMS + 4 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float);
gpu_perform_LS_kernel<<<blocks, threads, sz_shared>>>(pMem_conformations_next, pMem_energies_next);
gpu_perform_LS_kernel<<<blocks, threads>>>(pMem_conformations_next, pMem_energies_next);
LAUNCHERROR("gpu_perform_LS_kernel");
#if 0
cudaError_t status;
Expand Down
16 changes: 4 additions & 12 deletions cuda/kernel_ad.cu
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ gpu_gradient_minAD_kernel(
__shared__ int entity_id;
__shared__ float best_energy;
__shared__ float sFloatAccumulator;
extern __shared__ float sFloatBuff[];

volatile __shared__ float sFloatBuff[6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH];

// Ligand-atom position and partial energies
float3* calc_coords = (float3*)sFloatBuff;
Expand Down Expand Up @@ -135,7 +136,6 @@ gpu_gradient_minAD_kernel(
printf("%20s %.6f\n", "initial energy: ", energy);
#endif
}
__threadfence();
__syncthreads();
energy = pMem_energies_next[run_id * cData.dockpars.pop_size + entity_id];

Expand All @@ -160,7 +160,6 @@ gpu_gradient_minAD_kernel(
// E.g. in steepest descent "delta" is -1.0 * stepsize * gradient

// Asynchronous copy should be finished by here
__threadfence();
__syncthreads();

// Initializing vectors
Expand Down Expand Up @@ -214,7 +213,6 @@ gpu_gradient_minAD_kernel(
// =============================================================
// =============================================================
// Calculating energy & gradient
__threadfence();
__syncthreads();

gpu_calc_energrad(
Expand Down Expand Up @@ -272,7 +270,6 @@ gpu_gradient_minAD_kernel(
printf("\n");
#endif
}
__threadfence();
__syncthreads();
#endif // DEBUG_ENERGY_ADADELTA

Expand All @@ -297,7 +294,6 @@ gpu_gradient_minAD_kernel(
// Applying update
genotype[i] += delta;
}
__threadfence();
__syncthreads();

#if defined (DEBUG_SQDELTA_ADADELTA)
Expand All @@ -310,7 +306,6 @@ gpu_gradient_minAD_kernel(
printf("%13u %20.6f %15.6f %15.6f %15.6f\n", i, square_gradient[i], delta[i], square_delta[i], genotype[i]);
}
}
__threadfence();
__syncthreads();
#endif

Expand Down Expand Up @@ -356,8 +351,7 @@ gpu_gradient_minAD_kernel(
}
#endif
}
__threadfence();
__syncthreads(); // making sure that iteration_cnt is up-to-date
__syncthreads(); // making sure that iteration_cnt is up-to-date
#ifdef ADADELTA_AUTOSTOP
} while ((iteration_cnt < cData.dockpars.max_num_of_iters) && (rho > 0.01f));
#else
Expand All @@ -375,7 +369,6 @@ gpu_gradient_minAD_kernel(
}

// Updating old offspring in population
__threadfence();
__syncthreads();

offset = (run_id * cData.dockpars.pop_size + entity_id) * GENOTYPE_LENGTH_IN_GLOBMEM;
Expand Down Expand Up @@ -407,8 +400,7 @@ void gpu_gradient_minAD(
float* pMem_energies_next
)
{
size_t sz_shared = (6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float);
gpu_gradient_minAD_kernel<<<blocks, threads, sz_shared>>>(pMem_conformations_next, pMem_energies_next);
gpu_gradient_minAD_kernel<<<blocks, threads>>>(pMem_conformations_next, pMem_energies_next);
LAUNCHERROR("gpu_gradient_minAD_kernel");
#if 0
cudaError_t status;
Expand Down
Loading

0 comments on commit 0fe820f

Please sign in to comment.