diff --git a/Makefile.Cuda b/Makefile.Cuda
index 964dcf16..3356cf8d 100644
--- a/Makefile.Cuda
+++ b/Makefile.Cuda
@@ -18,8 +18,6 @@ UNAME := $(shell uname)
 TARGETS = 52 60 61 70
 CUDA_TARGETS=$(foreach target,$(TARGETS),-gencode arch=compute_$(target),code=sm_$(target))
 
-$(shell ./link_cuda.sh)
-
 ifeq ($(DEVICE), CPU)
 	DEV =-DCPU_DEVICE
 else ifeq ($(DEVICE), GPU)
@@ -37,16 +35,14 @@ KCMN_DIR=$(COMMON_DIR)
 BIN_DIR=./bin
 LIB_CUDA = kernels.o -lcurand -lcudart 
 
-
-# Host sources
-HOST_SRC=$(wildcard $(HOST_SRC_DIR)/*.cpp)
-SRC=$(HOST_SRC)
+TARGET := autodock
+TOOL_TARGET := adgpu_analysis
 
 IFLAGS=-I$(COMMON_DIR) -I$(HOST_INC_DIR) -I$(GPU_INCLUDE_PATH) -I$(KRNL_DIR)
 LFLAGS=-L$(GPU_LIBRARY_PATH) -Wl,-rpath=$(GPU_LIBRARY_PATH):$(CPU_LIBRARY_PATH)
 CFLAGS=-std=c++11 $(IFLAGS) $(LFLAGS)
+TOOL_CFLAGS=-std=c++11 -I$(COMMON_DIR) -I$(HOST_INC_DIR)
 
-TARGET := autodock
 ifeq ($(DEVICE), CPU)
 	TARGET:=$(TARGET)_cpu
 else ifeq ($(DEVICE), GPU)
@@ -96,8 +92,6 @@ endif
 CONFIG=RELEASE
 #CONFIG=FDEBUG
 
-
-
 ifeq ($(CONFIG),FDEBUG)
 	OPT =-O0 -g3 -Wall -DDOCK_DEBUG
 	CUDA_FLAGS = -G -use_fast_math --ptxas-options="-v" $(CUDA_TARGETS) -std=c++11
@@ -123,20 +117,22 @@ else
 endif
 # ------------------------------------------------------
 
-all: odock
+all: otool odock
 
 check-env-dev:
 	@if test -z "$$DEVICE"; then \
-		echo "DEVICE is undefined"; \
+		echo "Please set DEVICE to either CPU, GPU, CUDA, or OCLGPU to build docking software."; \
 		exit 1; \
 	else \
 		if [ "$$DEVICE" = "CPU" ]; then \
-			echo "DEVICE is set to $$DEVICE"; \
+			echo "DEVICE is set to $$DEVICE which is not a valid Cuda device."; \
+			exit 1; \
 		else \
 			if [ "$$DEVICE" = "GPU" ]; then \
 				echo "DEVICE is set to $$DEVICE"; \
 			else \
-				echo "DEVICE value is invalid. Set DEVICE to either CPU or GPU"; \
+				echo "DEVICE value is invalid. Please set DEVICE to either CPU, GPU, CUDA, or OCLGPU"; \
+				exit 1; \
 			fi; \
 		fi; \
 	fi; \
@@ -176,15 +172,39 @@ check-env-all: check-env-dev check-env-cpu check-env-gpu
 GIT_VERSION := $(shell ./version_string.sh)
 
 CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"
+TOOL_CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"
 
 # ------------------------------------------------------
 
+# otool (via unlink-code) removes and odock (via link-code) creates the same
+# performdocking.{h,cpp} symlinks, so the two builds must not overlap under make -j.
+.NOTPARALLEL:
+.PHONY: link-code unlink-code otool
+
+# Point performdocking.{h,cpp} at the Cuda flavour for the docking build.
+link-code:
+	ln -sf performdocking.h.Cuda $(HOST_INC_DIR)/performdocking.h
+	ln -sf performdocking.cpp.Cuda $(HOST_SRC_DIR)/performdocking.cpp
+
+# Remove the flavour symlinks so the tool build does not pick up performdocking.cpp.
+unlink-code:
+	rm -f $(HOST_INC_DIR)/performdocking.h $(HOST_SRC_DIR)/performdocking.cpp
+
 kernels: $(KERNEL_SRC)
 	$(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) -c $(KRNL_DIR)/kernels.cu
 
-odock: check-env-all kernels $(SRC)
+otool: unlink-code
+	@echo "Building" $(TOOL_TARGET) "..."
+	$(CPP) \
+	$(shell ls $(HOST_SRC_DIR)/*.cpp) \
+	$(TOOL_CFLAGS) \
+	-o$(BIN_DIR)/$(TOOL_TARGET) \
+	$(PIPELINE) $(OPT) -DTOOLMODE $(REP)
+
+odock: check-env-all kernels link-code
+	@echo "Building" $(TARGET) "..."
 	$(CPP) \
-	$(SRC) \
+	$(shell ls $(HOST_SRC_DIR)/*.cpp) \
 	$(CFLAGS) \
 	$(LIB_CUDA) \
 	-o$(BIN_DIR)/$(TARGET) \
diff --git a/Makefile.OpenCL b/Makefile.OpenCL
index 910e3832..365e39c3 100644
--- a/Makefile.OpenCL
+++ b/Makefile.OpenCL
@@ -15,8 +15,6 @@ CPP = g++
 LIB_OPENCL = -lOpenCL
 UNAME := $(shell uname)
 
-$(shell ./link_opencl.sh)
-
 ifeq ($(UNAME), Darwin)
 # In case ScoreP (for profiling/tracing) is used,
 # need to link to a *.dylib for instrumentation
@@ -57,14 +55,16 @@ KRNL_DIR=./device
 KCMN_DIR=$(COMMON_DIR)
 BIN_DIR=./bin
 
-# Host sources
-OCL_SRC=$(wildcard $(OCL_SRC_DIR)/*.cpp)
-HOST_SRC=$(wildcard $(HOST_SRC_DIR)/*.cpp)
-SRC=$(OCL_SRC) $(HOST_SRC)
+TARGET := autodock
+TOOL_TARGET := adgpu_analysis
 
 IFLAGS=-I$(COMMON_DIR) -I$(OCL_INC_DIR) -I$(HOST_INC_DIR) -I$(KRNL_DIR) -I$(OCLA_INC_PATH)
 LFLAGS=-L$(OCLA_LIB_PATH)
 CFLAGS=-std=c++11 $(IFLAGS) $(LFLAGS)
+TOOL_CFLAGS=-std=c++11 -I$(COMMON_DIR) -I$(HOST_INC_DIR)
+
+# Host sources
+OCL_SRC=$(wildcard $(OCL_SRC_DIR)/*.cpp)
 
 # Device sources
 KRNL_MAIN=calcenergy.cl
@@ -81,7 +81,6 @@ K_NAMES=-DK1=$(K1_NAME) -DK2=$(K2_NAME) -DK3=$(K3_NAME) -DK4=$(K4_NAME) -DK5=$(K
 # Kernel flags
 KFLAGS=-DKRNL_SOURCE=$(KRNL_DIR)/$(KRNL_MAIN) -DKRNL_DIRECTORY=$(KRNL_DIR) -DKCMN_DIRECTORY=$(KCMN_DIR) $(K_NAMES)
 
-TARGET := autodock
 ifeq ($(DEVICE), CPU)
 	TARGET:=$(TARGET)_cpu
 else ifeq ($(DEVICE), GPU)
@@ -183,11 +182,11 @@ else
 endif
 # ------------------------------------------------------
 
-all: odock
+all: otool odock
 
 check-env-dev:
 	@if test -z "$$DEVICE"; then \
-		echo "DEVICE is undefined"; \
+		echo "Please set DEVICE to either CPU, GPU, CUDA, or OCLGPU to build docking software."; \
 		exit 1; \
 	else \
 		if [ "$$DEVICE" = "CPU" ]; then \
@@ -196,7 +195,8 @@ check-env-dev:
 			if [ "$$DEVICE" = "GPU" ]; then \
 				echo "DEVICE is set to $$DEVICE"; \
 			else \
-				echo "DEVICE value is invalid. Set DEVICE to either CPU or GPU"; \
+				echo "DEVICE value is invalid. Please set DEVICE to either CPU, GPU, CUDA, or OCLGPU"; \
+				exit 1; \
 			fi; \
 		fi; \
 	fi; \
@@ -236,15 +236,39 @@ check-env-all: check-env-dev check-env-cpu check-env-gpu
 GIT_VERSION := $(shell ./version_string.sh)
 
 CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"
+TOOL_CFLAGS+=-DVERSION=\"$(GIT_VERSION)\"
 
 # ------------------------------------------------------
 
+# otool (via unlink-code) removes and odock (via link-code) creates the same
+# performdocking.{h,cpp} symlinks, so the two builds must not overlap under make -j.
+.NOTPARALLEL:
+.PHONY: link-code unlink-code otool
+
+# Point performdocking.{h,cpp} at the OpenCL flavour for the docking build.
+link-code:
+	ln -sf performdocking.h.OpenCL $(HOST_INC_DIR)/performdocking.h
+	ln -sf performdocking.cpp.OpenCL $(HOST_SRC_DIR)/performdocking.cpp
+
+# Remove the flavour symlinks so the tool build does not pick up performdocking.cpp.
+unlink-code:
+	rm -f $(HOST_INC_DIR)/performdocking.h $(HOST_SRC_DIR)/performdocking.cpp
+
 stringify:
 	./stringify_ocl_krnls.sh
 
-odock: check-env-all stringify $(SRC)
+otool: unlink-code
+	@echo "Building" $(TOOL_TARGET) "..."
+	$(CPP) \
+	$(shell ls $(HOST_SRC_DIR)/*.cpp) \
+	$(TOOL_CFLAGS) \
+	-o$(BIN_DIR)/$(TOOL_TARGET) \
+	$(PIPELINE) $(OPT) -DTOOLMODE $(REP)
+
+odock: check-env-all stringify link-code
+	@echo "Building" $(TARGET) "..."
 	$(CPP) \
-	$(SRC) \
+	$(OCL_SRC) $(shell ls $(HOST_SRC_DIR)/*.cpp) \
 	$(CFLAGS) \
 	$(LIB_OPENCL) \
 	-o$(BIN_DIR)/$(TARGET) \
diff --git a/README.md b/README.md
index 32f27425..69968382 100644
--- a/README.md
+++ b/README.md
@@ -134,7 +134,6 @@ By default the output log file is written in the current working folder. Example
 |--smooth           |   | Smoothing parameter for vdW interactions              | 0.5 (Å)          |
 |--elecmindist      |   | Min. electrostatic potential distance (w/ dpf: 0.5 Å) | 0.01 (Å)         |
 |--modqp            |   | Use modified QASP from VirtualDrug or AD4 original    | 0 (no, use AD4)  |
-|--cgmaps           |   | Use individual maps for CG-G0 instead of the same one | 0 (no, same map) |
 
 Autostop is ON by default since v1.4. The collective distribution of scores among all LGA populations
 is tested for convergence every `<asfreq>` generations, and docking is stopped if the top-scored poses
diff --git a/common/calcenergy_basic.h b/common/calcenergy_basic.h
index 57cc78b4..2045f7bb 100644
--- a/common/calcenergy_basic.h
+++ b/common/calcenergy_basic.h
@@ -43,38 +43,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 // WARNING: it is supposed that unsigned int is 32 bit long
 #define MAX_UINT                4294967296.0f
 
-// Macro for capturing grid values
-	// Original
-	#define GETGRIDVALUE(mempoi,gridsize_x,gridsize_y,gridsize_z,t,z,y,x)   *(mempoi + gridsize_x*(y + gridsize_y*(z + gridsize_z*t)) + x)
-
-	// Optimization 1
-	// #define GETGRIDVALUE_OPT(mempoi,gridsize_x,gridsize_y,mul_tmp,z,y,x)   *(mempoi + gridsize_x*(y + gridsize_y*(z + mul_tmp)) + x)
-
-	// Optimization 2
-	// Implemented directly in the kernel code: calcenergy_fourkernels_intel.cl
-
-typedef enum
-{
-	idx_000 = 0,
-	idx_010 = 1,
-	idx_001 = 2,
-	idx_011 = 3,
-	idx_100 = 4,
-	idx_110 = 5,
-	idx_101 = 6,
-	idx_111 = 7
-} indices;
-
-// Macro for trilinear interpolation
-#define TRILININTERPOL(cube, weights) (cube[idx_000]*weights[idx_000] + \
-                                       cube[idx_010]*weights[idx_010] + \
-                                       cube[idx_001]*weights[idx_001] + \
-                                       cube[idx_011]*weights[idx_011] + \
-                                       cube[idx_100]*weights[idx_100] + \
-                                       cube[idx_110]*weights[idx_110] + \
-                                       cube[idx_101]*weights[idx_101] + \
-                                       cube[idx_111]*weights[idx_111])
-
 // Sticking to array boundaries
 #define stick_to_bounds(x,a,b) x + (x <= a)*(a-x) + (x >= b)*(b-x)
 
diff --git a/common/defines.h b/common/defines.h
index 01862444..e3d16352 100644
--- a/common/defines.h
+++ b/common/defines.h
@@ -48,6 +48,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 	#define NUM_OF_THREADS_PER_BLOCK 16
 #endif
 
+typedef enum
+{
+	idx_000 = 0,
+	idx_010 = 1,
+	idx_001 = 2,
+	idx_011 = 3,
+	idx_100 = 4,
+	idx_110 = 5,
+	idx_101 = 6,
+	idx_111 = 7
+} indices;
+
 enum {C=0,N=1,O=2,H=3,XX=4,P=5,S=6};  // see "bond_index" in the "AD4.1_bound.dat" or "AD4_parameters.dat" file.
 #define NUM_ENUM_ATOMTYPES 7 // this should be the length of the enumerated atom types above
 
diff --git a/cuda/GpuData.h b/cuda/GpuData.h
index 57b5895e..b64bc0c1 100644
--- a/cuda/GpuData.h
+++ b/cuda/GpuData.h
@@ -111,7 +111,8 @@ typedef struct
 
 struct GpuData {
 	int                             devnum;
-	int                             preload_gridsize;
+	int                             devid;
+	int                             preallocated_gridsize;
 	GpuDockparameters               dockpars;
 	
 	// Consolidated constants and memory pointers to reduce kernel launch overhead
@@ -146,6 +147,8 @@ struct GpuTempData {
 	int*        pMem_evals_of_new_entities;
 	int*        pMem_gpu_evals_of_runs;
 	uint32_t*   pMem_prng_states;
+	char*       device_name;
+	bool        device_busy;
 };
 #endif
 
diff --git a/cuda/kernel3.cu b/cuda/kernel3.cu
index 8c147e33..65f2975e 100644
--- a/cuda/kernel3.cu
+++ b/cuda/kernel3.cu
@@ -62,10 +62,10 @@ gpu_perform_LS_kernel(
 	float3* calc_coords = (float3*)sFloatBuff;
 
 	// Genotype pointers
-	float* genotype_candidate = (float*)(calc_coords + cData.dockpars.num_of_atoms);
-	float* genotype_deviate = (float*)(genotype_candidate + cData.dockpars.num_of_genes);
-	float* genotype_bias = (float*)(genotype_deviate + cData.dockpars.num_of_genes);
-	float* offspring_genotype = (float*)(genotype_bias + cData.dockpars.num_of_genes);
+	float* genotype_candidate = (float*)(calc_coords + MAX_NUM_OF_ATOMS);
+	float* genotype_deviate = (float*)(genotype_candidate + ACTUAL_GENOTYPE_LENGTH);
+	float* genotype_bias = (float*)(genotype_deviate + ACTUAL_GENOTYPE_LENGTH);
+	float* offspring_genotype = (float*)(genotype_bias + ACTUAL_GENOTYPE_LENGTH);
 
 	// Determining run ID and entity ID
 	// Initializing offspring genotype
@@ -321,7 +321,7 @@ void gpu_perform_LS(
                     float*   pMem_energies_next
                    )
 {
-	size_t sz_shared = (3 * cpuData.dockpars.num_of_atoms + 4 * cpuData.dockpars.num_of_genes) * sizeof(float);
+	size_t sz_shared = (3 * MAX_NUM_OF_ATOMS + 4 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float);
 	gpu_perform_LS_kernel<<<blocks, threads, sz_shared>>>(pMem_conformations_next, pMem_energies_next);
 	LAUNCHERROR("gpu_perform_LS_kernel");
 #if 0
diff --git a/cuda/kernel_ad.cu b/cuda/kernel_ad.cu
index d8d5dfcf..631ae566 100644
--- a/cuda/kernel_ad.cu
+++ b/cuda/kernel_ad.cu
@@ -92,22 +92,22 @@ gpu_gradient_minAD_kernel(
 	// Gradient of the intermolecular energy per each ligand atom
 	// Also used to store the accummulated gradient per each ligand atom
 #ifdef FLOAT_GRADIENTS
-	float3* cartesian_gradient = (float3*)(calc_coords + cData.dockpars.num_of_atoms);
+	float3* cartesian_gradient = (float3*)(calc_coords + MAX_NUM_OF_ATOMS);
 #else
-	int3* cartesian_gradient = (int3*)(calc_coords + cData.dockpars.num_of_atoms);
+	int3* cartesian_gradient = (int3*)(calc_coords + MAX_NUM_OF_ATOMS);
 #endif
 	// Genotype pointers
-	float* genotype = (float*)(cartesian_gradient + cData.dockpars.num_of_atoms);
-	float* best_genotype = genotype + cData.dockpars.num_of_genes;
+	float* genotype = (float*)(cartesian_gradient + MAX_NUM_OF_ATOMS); // so far used 3*2*MAX_NUM_OF_ATOMS
+	float* best_genotype = genotype + ACTUAL_GENOTYPE_LENGTH;
 
 	// Partial results of the gradient step
-	float* gradient = best_genotype + cData.dockpars.num_of_genes;
+	float* gradient = best_genotype + ACTUAL_GENOTYPE_LENGTH;
 
 	// Squared updates E[dx^2]
-	float* square_delta = gradient + cData.dockpars.num_of_genes;
+	float* square_delta = gradient + ACTUAL_GENOTYPE_LENGTH;
 
 	// Vector for storing squared gradients E[g^2]
-	float* square_gradient = square_delta + cData.dockpars.num_of_genes;
+	float* square_gradient = square_delta + ACTUAL_GENOTYPE_LENGTH; // so far used 5*ACTUAL_GENOTYPE_LENGTH
 
 	// Iteration counter for the minimizer
 	uint32_t iteration_cnt = 0;
@@ -407,7 +407,7 @@ void gpu_gradient_minAD(
                         float*   pMem_energies_next
                        )
 {
-	size_t sz_shared = (6 * cpuData.dockpars.num_of_atoms + 5 * cpuData.dockpars.num_of_genes) * sizeof(float);
+	size_t sz_shared = (6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float);
 	gpu_gradient_minAD_kernel<<<blocks, threads, sz_shared>>>(pMem_conformations_next, pMem_energies_next);
 	LAUNCHERROR("gpu_gradient_minAD_kernel");
 #if 0
diff --git a/cuda/kernel_adam.cu b/cuda/kernel_adam.cu
index b0f81b30..11f37b3f 100644
--- a/cuda/kernel_adam.cu
+++ b/cuda/kernel_adam.cu
@@ -85,23 +85,23 @@ gpu_gradient_minAdam_kernel(
 	// Gradient of the intermolecular energy per each ligand atom
 	// Also used to store the accummulated gradient per each ligand atom
 #ifdef FLOAT_GRADIENTS
-	float3* cartesian_gradient = (float3*)(calc_coords + cData.dockpars.num_of_atoms);
+	float3* cartesian_gradient = (float3*)(calc_coords + MAX_NUM_OF_ATOMS);
 #else
-	int3* cartesian_gradient = (int3*)(calc_coords + cData.dockpars.num_of_atoms);
+	int3* cartesian_gradient = (int3*)(calc_coords + MAX_NUM_OF_ATOMS);
 #endif
 
 	// Genotype pointers
-	float* genotype = (float*)(cartesian_gradient + cData.dockpars.num_of_atoms);
-	float* best_genotype = genotype + cData.dockpars.num_of_genes;
+	float* genotype = (float*)(cartesian_gradient + MAX_NUM_OF_ATOMS);
+	float* best_genotype = genotype + ACTUAL_GENOTYPE_LENGTH;
 
 	// Partial results of the gradient step
-	float* gradient = best_genotype + cData.dockpars.num_of_genes;
+	float* gradient = best_genotype + ACTUAL_GENOTYPE_LENGTH;
 
 	// Adam mt parameter
-	float* mt = gradient + cData.dockpars.num_of_genes;
+	float* mt = gradient + ACTUAL_GENOTYPE_LENGTH;
 
 	// Adam vt parameter
-	float* vt = mt + cData.dockpars.num_of_genes;
+	float* vt = mt + ACTUAL_GENOTYPE_LENGTH;
 
 	// Iteration counter for the minimizer
 	uint32_t iteration_cnt = 0;
@@ -410,7 +410,7 @@ void gpu_gradient_minAdam(
                           float* pMem_energies_next
 )
 {
-	size_t sz_shared = (6 * cpuData.dockpars.num_of_atoms + 5 * cpuData.dockpars.num_of_genes) * sizeof(float);
+	size_t sz_shared = (6 * MAX_NUM_OF_ATOMS + 5 * ACTUAL_GENOTYPE_LENGTH) * sizeof(float);
 	gpu_gradient_minAdam_kernel<<<blocks, threads, sz_shared>>>(pMem_conformations_next, pMem_energies_next);
 	LAUNCHERROR("gpu_gradient_minAdam_kernel");
 #if 0
diff --git a/device/GpuData.h b/device/GpuData.h
index 9d5986fe..64a3b36a 100644
--- a/device/GpuData.h
+++ b/device/GpuData.h
@@ -28,7 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 struct GpuData {
 	int devnum;
-	int preload_gridsize;
+	int preallocated_gridsize;
 	// Consolidated constants and memory pointers to reduce kernel launch overhead
 	// dynamic
 	cl_mem mem_interintra_const;
@@ -57,6 +57,8 @@ struct GpuTempData {
 	cl_kernel        kernel6;
 	cl_kernel        kernel7;
 	cl_mem           pMem_fgrids;
+	char*            device_name;
+	bool             device_busy;
 };
 #endif
 
diff --git a/host/inc/autostop.hpp b/host/inc/autostop.hpp
index 7102af13..a3f7a317 100644
--- a/host/inc/autostop.hpp
+++ b/host/inc/autostop.hpp
@@ -30,6 +30,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <cmath>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string>
+#include "miscellaneous.h"
 
 class AutoStop{
 	      bool               first_time;
@@ -51,6 +53,8 @@ class AutoStop{
 	const unsigned int       as_frequency;
 	      float              delta_energy;
 	      float              overall_best_energy;
+	      char*              outbuf;
+	      std::string*       output;
 
 	inline float average(float* average_sd2_N)
 	{
@@ -123,7 +127,8 @@ class AutoStop{
 	         int pop_size_in,
 	         int num_of_runs_in,
 	         float stopstd_in,
-	         int as_frequency_in
+	         int as_frequency_in,
+	         std::string* out_in
 	        )
 		: rolling(4*4, 0), // Initialize to zero
 		  average_sd2_N((pop_size_in+1)*3),
@@ -132,24 +137,33 @@ class AutoStop{
 		  Ntop(pop_size_in),
 		  Ncream(pop_size_in / 10),
 		  stopstd(stopstd_in),
-		  as_frequency(as_frequency_in)
+		  as_frequency(as_frequency_in),
+		  output(out_in)
 	{
 		first_time = true;
 		autostopped = false;
 		threshold = 1<<24;
+		threshold_used = threshold;
 		thres_stddev = threshold;
 		curr_avg = -(1<<24);
 		curr_std = thres_stddev;
 		roll_count = 0;
 		bestN = 1;
 		delta_energy = 2.0 * thres_stddev / Ntop;
+		overall_best_energy = 1<<24;
+		if(output!=NULL) outbuf = (char*)malloc(256*sizeof(char));
+	}
+
+	~AutoStop()
+	{
+		if(output!=NULL) free(outbuf);
 	}
 
 	inline void print_intro(unsigned long num_of_generations, unsigned long num_of_energy_evals)
 	{
-		printf("\nExecuting docking runs, stopping automatically after either reaching %.2f kcal/mol standard deviation of\nthe best molecules of the last 4 * %u generations, %lu generations, or %lu evaluations:\n\n",stopstd,as_frequency,num_of_generations,num_of_energy_evals);
-		printf("Generations |  Evaluations |     Threshold    |  Average energy of best 10%%  | Samples |    Best energy\n");
-		printf("------------+--------------+------------------+------------------------------+---------+-------------------\n");
+		para_printf("\nExecuting docking runs, stopping automatically after either reaching %.2f kcal/mol standard deviation of\nthe best molecules of the last 4 * %u generations, %lu generations, or %lu evaluations:\n\n",stopstd,as_frequency,num_of_generations,num_of_energy_evals);
+		para_printf("Generations |  Evaluations |     Threshold    |  Average energy of best 10%%  | Samples |    Best energy\n");
+		para_printf("------------+--------------+------------------+------------------------------+---------+-------------------\n");
 	}
 
 	inline bool check_if_satisfactory(int generation_cnt, const float* energies, unsigned long total_evals)
@@ -181,7 +195,7 @@ class AutoStop{
 				delta_energy = 2.0 * thres_stddev / (Ntop-1);
 			}
 		}
-		printf("%11u | %12lu |%8.2f kcal/mol |%8.2f +/-%8.2f kcal/mol |%8i |%8.2f kcal/mol\n",generation_cnt,total_evals/num_of_runs,threshold_used,curr_avg,curr_std,bestN,overall_best_energy);
+		para_printf("%11u | %12lu |%8.2f kcal/mol |%8.2f +/-%8.2f kcal/mol |%8i |%8.2f kcal/mol\n",generation_cnt,total_evals/num_of_runs,threshold_used,curr_avg,curr_std,bestN,overall_best_energy);
 		fflush(stdout);
 		rolling[4*roll_count] = curr_avg * bestN;
 		rolling[4*roll_count+1] = (curr_std*curr_std + curr_avg*curr_avg)*bestN;
@@ -206,17 +220,17 @@ class AutoStop{
 
 	inline void output_final_stddev(int generation_cnt, const float* energies, unsigned long total_evals){
 		if (autostopped){
-			printf("------------+--------------+------------------+------------------------------+---------+-------------------\n");
-			printf("\n%43s evaluation after reaching\n%40.2f +/-%8.2f kcal/mol combined.\n%34i samples, best energy %8.2f kcal/mol.\n","Finished",average(&average_sd2_N[0]),stddev(&average_sd2_N[0]),(unsigned int)average_sd2_N[2],overall_best_energy);
+			para_printf("------------+--------------+------------------+------------------------------+---------+-------------------\n");
+			para_printf("\n%43s evaluation after reaching\n%40.2f +/-%8.2f kcal/mol combined.\n%34i samples, best energy %8.2f kcal/mol.\n","Finished",average(&average_sd2_N[0]),stddev(&average_sd2_N[0]),(unsigned int)average_sd2_N[2],overall_best_energy);
 		} else {
 			// Stopped without autostop; output stddev statistics regardless
 
 			tabulate_energies(energies);  // Fills average_sd2_N and overall_best_energy
 			set_stats(); // set curr_avg, curr_std, bestN, average_sd2_N
 
-			printf("%11u | %12lu |%8.2f kcal/mol |%8.2f +/-%8.2f kcal/mol |%8i |%8.2f kcal/mol\n",generation_cnt,total_evals/num_of_runs,threshold,curr_avg,curr_std,bestN,overall_best_energy);
-			printf("------------+--------------+------------------+------------------------------+---------+-------------------\n");
-			printf("\n%43s evaluation after reaching\n%33lu evaluations. Best energy %8.2f kcal/mol.\n","Finished",total_evals/num_of_runs,overall_best_energy);
+			para_printf("%11u | %12lu |%8.2f kcal/mol |%8.2f +/-%8.2f kcal/mol |%8i |%8.2f kcal/mol\n",generation_cnt,total_evals/num_of_runs,threshold,curr_avg,curr_std,bestN,overall_best_energy);
+			para_printf("------------+--------------+------------------+------------------------------+---------+-------------------\n");
+			para_printf("\n%43s evaluation after reaching\n%33lu evaluations. Best energy %8.2f kcal/mol.\n","Finished",total_evals/num_of_runs,overall_best_energy);
 		}
 		fflush(stdout);
 	}
diff --git a/host/inc/filelist.hpp b/host/inc/filelist.hpp
index 6ea8d073..c88fcdc0 100644
--- a/host/inc/filelist.hpp
+++ b/host/inc/filelist.hpp
@@ -34,25 +34,28 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 typedef struct _Dockpars Dockpars;
 
+typedef struct _fld_files_data{
+	std::string name;
+	size_t grid_idx;
+} fld_files_data;
+
 class FileList{
 	public:
-
-	bool                     used;
-	int                      nfiles;
-	bool                     preload_maps;
-	bool                     maps_are_loaded;
-	char*                    filename;
-	int                      max_len; // maximum length of strings in arrays below
-	std::vector<std::string> resnames;
-	std::vector<std::string> fld_files;
-	std::vector<std::string> ligand_files;
-	std::vector<Dockpars>    mypars;
-	std::vector<Gridinfo>    mygrids;
-	std::vector<bool>        load_maps_gpu; // indicate which device needs to still load maps from cpu
-
-	// Default to unused, with 1 file
-	FileList() : used( false ), nfiles( 1 ), preload_maps( true ), maps_are_loaded( false ), filename( NULL ), max_len ( 0 ) {}
-	~FileList(){ if(filename) free(filename); }
+		bool                        used;
+		int                         nfiles;
+		bool                        preload_maps;
+		bool                        maps_are_loaded;
+		char*                       filename;
+		std::vector<std::string>    resnames;
+		std::vector<fld_files_data> fld_files;
+		std::vector<std::string>    ligand_files;
+		std::vector<Dockpars>       mypars;
+		std::vector<Gridinfo>       mygrids;
+		std::vector<bool>           load_maps_gpu; // indicate which device needs to still load maps from cpu
+
+		// Default to unused, with 1 file
+		FileList() : used( false ), nfiles( 1 ), preload_maps( true ), maps_are_loaded( false ), filename( NULL ) {}
+		~FileList(){ if(filename) free(filename); }
 };
 
 #endif
diff --git a/host/inc/getparameters.h b/host/inc/getparameters.h
index 95c3f300..e8011247 100644
--- a/host/inc/getparameters.h
+++ b/host/inc/getparameters.h
@@ -65,8 +65,9 @@ constexpr AD4_free_energy_coeffs unbound_models[3] = {
 // Struct which contains the docking parameters (partly parameters for fpga)
 typedef struct _Dockpars
 {                                                                 // default values
-	int                       devnum                          = -1;
-	int                       devices_requested               = 1; // this is AD-GPU ...
+	int                       devnum                          = -1; // actual device number (-1 means not set, grab first)
+	std::vector<int>          dev_pool;
+	int                       dev_pool_nr                     = -1; // number in pool of many devices (-1 means none set, grab first)
 	uint32_t                  seed[3]                         = {(uint32_t)time(NULL),(uint32_t)processid(),0};
 	unsigned long             num_of_energy_evals             = 2500000;
 	unsigned long             num_of_generations              = 42000;
@@ -107,6 +108,8 @@ typedef struct _Dockpars
 	float                     H_cutoff                        = 3.7;
 	float                     V_cutoff                        = 4.0;
 	unsigned int              xml_files                       = 0;
+	unsigned int              filelist_files                  = 0;
+	unsigned int              filelist_grid_idx               = 0;
 	bool                      dlg2stdout                      = false;
 	int                       gen_pdbs                        = 0;
 	char*                     dpffile                         = NULL;
@@ -119,7 +122,6 @@ typedef struct _Dockpars
 	bool                      autostop                        = true;
 	unsigned int              as_frequency                    = 5;
 	float                     stopstd                         = 0.15;
-	bool                      cgmaps                          = false; // default is false (use a single map for every CGx or Gx atom type)
 	unsigned long             num_of_runs                     = 20;
 	unsigned int              list_nr                         = 0;
 	bool                      reflig_en_required              = false;
@@ -136,14 +138,27 @@ typedef struct _Dockpars
 	float                     adam_epsilon                    = 1.0e-8f;
 	bool                      output_dlg                      = true; // dlg output file will be generated (by default)
 	bool                      output_xml                      = true; // xml output file will be generated (by default)
+	bool                      calc_clustering                 = true; // whether clustering will be calculated and output
 } Dockpars;
 
-inline bool add_deriv_atype(
-                            Dockpars* mypars,
-                            char*     name,
-                            int       length
-                           )
+inline int add_deriv_atype(
+                           Dockpars* mypars,
+                           char*     name,
+                           int       length,
+                           bool      ignore_doublets = false
+                          )
 {
+	// name is too long
+	if(length>=4) return 0;
+	// make sure name hasn't already been used; compare whole names, a new name that is only a prefix of an existing one is not a doublet
+	for(int i=0; i<mypars->nr_deriv_atypes; i++){
+		if ((strlen(mypars->deriv_atypes[i].deriv_name) == (size_t)length) && (strncmp(mypars->deriv_atypes[i].deriv_name, name, length) == 0)){
+			if(ignore_doublets) return -i-1; // return doublet index starting from -1
+			printf("Error: -derivtype type name \"%s\" has already been used.\n",mypars->deriv_atypes[i].deriv_name);
+			exit(2);
+		}
+	}
+	// add new type name
 	mypars->nr_deriv_atypes++;
 	mypars->deriv_atypes=(deriv_atype*)realloc(mypars->deriv_atypes,mypars->nr_deriv_atypes*sizeof(deriv_atype));
 	if(mypars->deriv_atypes==NULL){
@@ -151,18 +166,9 @@ inline bool add_deriv_atype(
 		exit(1);
 	}
 	mypars->deriv_atypes[mypars->nr_deriv_atypes-1].nr=mypars->nr_deriv_atypes;
-	if(length<4){
-		strncpy(mypars->deriv_atypes[mypars->nr_deriv_atypes-1].deriv_name,name,length);
-		mypars->deriv_atypes[mypars->nr_deriv_atypes-1].deriv_name[length]='\0';
-	} else return false; // name is too long
-	// make sure name hasn't already been used
-	for(int i=0; i<mypars->nr_deriv_atypes-1; i++){
-		if (strcmp(mypars->deriv_atypes[i].deriv_name, mypars->deriv_atypes[mypars->nr_deriv_atypes-1].deriv_name) == 0){
-			printf("Error: -derivtype type name \"%s\" has already been used.\n",mypars->deriv_atypes[i].deriv_name);
-			exit(2);
-		}
-	}
-	return true;
+	strncpy(mypars->deriv_atypes[mypars->nr_deriv_atypes-1].deriv_name,name,length);
+	mypars->deriv_atypes[mypars->nr_deriv_atypes-1].deriv_name[length]='\0';
+	return 1;
 }
 
 bool argcmp(
@@ -171,13 +177,13 @@ bool argcmp(
             const char shortarg = '\0'
            );
 
-int preparse_dpf(
-                 const int*      argc,
-                       char**    argv,
-                       Dockpars* mypars,
-                       Gridinfo* mygrid,
-                       FileList& filelist
-                );
+int initial_commandpars(
+                        const int*      argc,
+                              char**    argv,
+                              Dockpars* mypars,
+                              Gridinfo* mygrid,
+                              FileList& filelist
+                       );
 
 int get_filelist(
                  const int*      argc,
@@ -191,7 +197,8 @@ int get_filenames_and_ADcoeffs(
                                const int*,
                                      char**,
                                      Dockpars*,
-                               const bool
+                               const bool,
+                               const bool = true
                               );
 
 void print_options(
diff --git a/host/inc/miscellaneous.h b/host/inc/miscellaneous.h
index 988f760e..ab3b64ff 100644
--- a/host/inc/miscellaneous.h
+++ b/host/inc/miscellaneous.h
@@ -31,13 +31,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <math.h>
 #include <stdlib.h>
 #include <ctype.h>
+#include <cstring>
 #include <limits>
 #include <cstdint>
+#include <string>
 
 #ifdef _WIN32
 #include <processthreadsapi.h>
 inline unsigned int processid() { return GetProcessId(); }
 #else
+// libgen.h contains basename() and dirname() from a fullpath name
+// Specifically: to correctly open grid map field files and associated files
+// http://ask.systutorials.com/681/get-the-directory-path-and-file-name-from-absolute-path-linux
+#include <libgen.h>
 #include <unistd.h>
 inline unsigned int processid() { return getpid(); }
 #endif
@@ -46,6 +52,12 @@ inline unsigned int processid() { return getpid(); }
 
 #define PHI 0x9e3779b9
 
+#ifdef USE_PIPELINE
+#define para_printf(f_, ...) do { if(output==NULL){ printf((f_), ##__VA_ARGS__); } else { snprintf(outbuf, 256, (f_), ##__VA_ARGS__); *output += outbuf; } } while(false)
+#else
+#define para_printf(f_, ...) printf((f_), ##__VA_ARGS__)
+#endif
+
 typedef struct
 {
 	int  nr;            // this number starts at 1 and will be used to extend the base atom type nr
@@ -70,11 +82,7 @@ typedef struct
 	double z;
 } Quaternion;
 
-// macro that calculates the trilinear interpolation,
-// the first parameter is a 2*2*2 array of the values of the function
-// in the vertices of the cube,
-// and the second one is a 2*2*2 array of the interpolation weights
-#define trilin_interpol(cube, weights) (cube[0][0][0]*weights[0][0][0] +cube[1][0][0]*weights[1][0][0] +cube[0][1][0]*weights[0][1][0] +cube[1][1][0]*weights[1][1][0] +cube[0][0][1]*weights[0][0][1] +cube[1][0][1]*weights[1][0][1] +cube[0][1][1]*weights[0][1][1] +cube[1][1][1]*weights[1][1][1])
+float map2float(const char* c);
 
 int float2fracint(double, int);
 
@@ -84,10 +92,14 @@ long long float2fraclint(double, int);
 
 double distance(const double [], const double []);
 
+double distance2(const double [], const double []);
+
 void vec_point2line(const double [], const double [], const double [], double []);
 
 void rotate(double [], const double [], const double [], const double*, int);
 
+std::string get_filepath(const char* filename);
+
 #if 0
 // -------------------------------------------------------------------
 // Replacing rotation genes: from spherical space to Shoemake space
@@ -109,10 +121,6 @@ double angle_of_vectors(const double [], const double []);
 
 void vec_crossprod(const double [], const double [], double []);
 
-void get_trilininterpol_weights(double [][2][2], const double*, const double*, const double*);
-
-void get_trilininterpol_weights_f(float [][2][2], const float*, const float*, const float*);
-
 void print_binary_string(unsigned long long);
 
 #ifndef _WIN32
diff --git a/host/inc/performdocking.h.Cuda b/host/inc/performdocking.h.Cuda
index 99f628d5..dede1227 100644
--- a/host/inc/performdocking.h.Cuda
+++ b/host/inc/performdocking.h.Cuda
@@ -62,12 +62,7 @@ typedef struct {
 } Gradientparameters;
 #endif
 
-void copy_map_to_gpu(
-                     GpuTempData&      tData,
-                     std::vector<Map>& all_maps,
-                     int               t,
-                     int               size_of_one_map
-                    );
+std::vector<int> get_gpu_pool();
 
 void setup_gpu_for_docking(
                            GpuData&     cData,
@@ -81,7 +76,6 @@ void finish_gpu_from_docking(
 
 int docking_with_gpu(
                      const Gridinfo*        mygrid,
-                     /*const*/ float*       cpu_floatgrids,
                            Dockpars*        mypars,
                      const Liganddata*      myligand_init,
                      const Liganddata*      myxrayligand,
@@ -91,7 +85,7 @@ int docking_with_gpu(
                            SimulationState& sim_state,
                            GpuData&         cData,
                            GpuTempData&     tData,
-                           bool             floatgrids_preloaded
+                           std::string*     output = NULL
                     );
 
 double check_progress(
diff --git a/host/inc/performdocking.h.OpenCL b/host/inc/performdocking.h.OpenCL
index f70384fa..4da1c492 100644
--- a/host/inc/performdocking.h.OpenCL
+++ b/host/inc/performdocking.h.OpenCL
@@ -74,12 +74,7 @@ typedef struct {
 } Gradientparameters;
 #endif
 
-void copy_map_to_gpu(
-                     GpuTempData&      tData,
-                     std::vector<Map>& all_maps,
-                     int               t,
-                     int               size_of_one_map
-                    );
+std::vector<int> get_gpu_pool();
 
 void setup_gpu_for_docking(
                            GpuData&     cData,
@@ -93,7 +88,6 @@ void finish_gpu_from_docking(
 
 int docking_with_gpu(
                      const Gridinfo*        mygrid,
-                     /*const*/ float*       cpu_floatgrids,
                            Dockpars*        mypars,
                      const Liganddata*      myligand_init,
                      const Liganddata*      myxrayligand,
@@ -103,7 +97,7 @@ int docking_with_gpu(
                            SimulationState& sim_state,
                            GpuData&         cData,
                            GpuTempData&     tData,
-                           bool             floatgrids_preloaded
+                           std::string*     output = NULL
                     );
 
 double check_progress(
diff --git a/host/inc/processgrid.h b/host/inc/processgrid.h
index b4c51cd2..f484a22a 100644
--- a/host/inc/processgrid.h
+++ b/host/inc/processgrid.h
@@ -36,40 +36,24 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <fstream>
 #include <sstream>
 
-#ifndef _WIN32
-
-#include <libgen.h>
-
-// libgen.h contains basename() and dirname() from a fullpath name
-// Specific: to open correctly grid map field fiels and associated files
-// http://ask.systutorials.com/681/get-the-directory-path-and-file-name-from-absolute-path-linux
-#endif
-
 #include "defines.h"
 #include "miscellaneous.h"
 
-#define getvalue_4Darr(mempoi, grinf, t, z, y, x)                  *(mempoi + 4*((grinf).size_xyz[0] * (y + (grinf).size_xyz[1] * (z + (grinf).size_xyz[2]*t)) + x))
-#define getvalue_4Darr_withsize(mempoi, gridsize_xyz, t, z, y, x)  *(mempoi + 4*(gridsize_xyz[0]*(y + gridsize_xyz[1] * (z + gridsize_xyz[2]*t)) + x))
-//The macro helps to access the grid point values
-//which were read from external files with get_gridvalues function.
-//The first parameter is a pointer which points to the memory area storing the data.
-//The second one is the corresponding grid info (parameter of get_gridinfo function).
-//The other parameters are the type index, z, y and x coordinates of the grid point.
-
 // Struct containing all the important information coming from .gpf and .xyz files.
 typedef struct _Gridinfo
 {
-	char*  grid_file_path = NULL; // Added to store the full path of the grid file
-	char*  receptor_name  = NULL;
-	char*  map_base_name  = NULL;
-	int    size_xyz       [3];
-	double spacing;
-	double size_xyz_angstr[3];
-	char   grid_types     [MAX_NUM_OF_ATYPES+2][4]; // The additional two are the electrostatic and the desolvation types
-	int    num_of_atypes;
-	int    num_of_map_atypes;
-	double origo_real_xyz [3];
-	bool   info_read      = false; // so we don't have to continue reading the same information over and over again
+	std::string fld_name; // keep track of fld filename
+	std::string grid_file_path; // Added to store the full path of the grid file
+	std::string receptor_name;
+	std::string map_base_name;
+	int         size_xyz           [3];
+	double      spacing;
+	double      size_xyz_angstr    [3];
+	bool        fld_relative       = true; // By default (and until further notice) map file names are relative to the fld file
+	int         num_of_map_atypes;
+	double      origo_real_xyz     [3];
+	std::vector<std::string> grid_mapping; // stores the atom types and associated map filenames from the fld file
+	std::vector<float> grids;
 } Gridinfo;
 
 struct Map
@@ -80,20 +64,10 @@ struct Map
 };
 
 int get_gridinfo(
-                 const char*,
-                       Gridinfo*
+                 const char*     fldfilename,
+                       Gridinfo* mygrid
                 );
 
-int get_gridvalues_f(
-                     const Gridinfo* mygrid,
-                           float**   fgrids,
-                           bool      cgmaps
-                    );
-
-int get_gridvalues_f(
-                     const Gridinfo* mygrid,
-                           float*    fgrids,
-                           bool      cgmaps
-                    );
+int get_gridvalues(Gridinfo* mygrid);
 
 #endif /* PROCESSGRID_H_ */
diff --git a/host/inc/processligand.h b/host/inc/processligand.h
index a9c346eb..c5ebb705 100644
--- a/host/inc/processligand.h
+++ b/host/inc/processligand.h
@@ -41,7 +41,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
     }
 
 // Struct which contains ligand and flexres information.
-typedef struct
+typedef struct _Liganddata
 {
 // num_of_atoms:          Number of ligand/flexres atoms.
 	int            num_of_atoms;
@@ -67,6 +67,8 @@ typedef struct
 	char           base_atom_names       [MAX_NUM_OF_ATOMS][4];
 // atom_map_to_fgrids:    Maps each moving atom to a (pre-loaded) map id
 	int            atom_map_to_fgrids    [MAX_NUM_OF_ATOMS];
+// ligand grid types
+	char           ligand_grid_types     [MAX_NUM_OF_ATYPES+2][4]; // The additional two are the electrostatic and the desolvation types
 // atom_idxyzq:           Each row describes one atom of the ligand.
 //                        The columns (second index) contain the atom type code, x, y and z coordinate
 //                        (in Angstroms) and electrical charge  of the atom.
@@ -143,6 +145,9 @@ typedef struct
 	bool           acceptor                [MAX_NUM_OF_ATOMS];
 	bool           donor                   [MAX_NUM_OF_ATOMS];
 	bool           reactive                [MAX_NUM_OF_ATOMS]; // atoms with 1,4,7 numbered atom types
+// store the ligand file content so it only gets read once (not up to 4 times ...)
+	std::vector<std::string> file_content;
+	unsigned int ligand_line_count = 0;
 } Liganddata;
 
 // structure to store relevant receptor atom data
@@ -178,14 +183,14 @@ int init_liganddata(
                           Liganddata*,
                           Gridinfo*,
                           int          nr_deriv_atypes,
-                          deriv_atype* deriv_atypes,
-                          bool         cgmaps
+                          deriv_atype* deriv_atypes
                    );
 
 int set_liganddata_typeid(
-                                Liganddata*,
-                                int,
-                          const char*
+                                Liganddata* myligand,
+                                Gridinfo*   mygrid,
+                                int         atom_id,
+                          const char*       typeof_new_atom
                          );
 
 void get_intraE_contributors(Liganddata*);
@@ -211,17 +216,16 @@ int get_VWpars(
 
 int get_moving_and_unit_vectors(Liganddata*);
 
-int get_liganddata(
-                   const char*,
-                   const char*,
-                         Liganddata*,
-                   const double,
-                   const double,
-                         int          nr_deriv_atypes,
-                         deriv_atype* deriv_atypes,
-                         int          nr_mod_atype_pairs,
-                         pair_mod*    mod_atype_pairs
-                  );
+int parse_liganddata(
+                           Liganddata*  myligand,
+                           Gridinfo*    mygrid,
+                     const double       AD4_coeff_vdW,
+                     const double       AD4_coeff_hb,
+                           int          nr_deriv_atypes,
+                           deriv_atype* deriv_atypes,
+                           int          nr_mod_atype_pairs,
+                           pair_mod*    mod_atype_pairs
+                    );
 
 int gen_new_pdbfile(const char*, const char*, const Liganddata*);
 
@@ -233,7 +237,12 @@ void move_ligand(Liganddata*, const double [], const double []);
 
 void scale_ligand(Liganddata*, const double);
 
-double calc_rmsd(const Liganddata*, const Liganddata*, const bool);
+double calc_rmsd(
+                 const double       atom_idxyzq_ref [MAX_NUM_OF_ATOMS][5],
+                 const double       atom_idxyzq     [MAX_NUM_OF_ATOMS][5],
+                       unsigned int num_atoms,
+                 const bool         handle_symmetry
+                );
 
 double calc_ddd_Mehler_Solmajer(double);
 
@@ -244,19 +253,6 @@ bool is_H_bond(
                const char* atype2
               );
 
-void print_ref_lig_energies_f(
-                                    Liganddata,
-                              const float,
-                                    Gridinfo,
-                              const float*,
-                              const float,
-                              const float,
-                              const float,
-                              const float,
-                                    int,
-                                    pair_mod*
-                             );
-
 //////////////////////////////////
 //float functions
 
@@ -307,23 +303,14 @@ std::vector<AnalysisData> analyze_ligand_receptor(
 float calc_interE_f(
                     const Gridinfo*   mygrid,
                     const Liganddata* myligand,
-                    const float*      fgrids,
                           float       outofgrid_tolerance,
                           int         debug,
-                          float&      intraflexE
+                          float&      intraflexE,
+                          float*      elecE = NULL,
+                          float*      peratom_vdw = NULL,
+                          float*      peratom_elec = NULL
                    );
 
-void calc_interE_peratom_f(
-                           const Gridinfo*   mygrid,
-                           const Liganddata* myligand,
-                           const float*      fgrids,
-                                 float       outofgrid_tolerance,
-                                 float*      elecE,
-                                 float       peratom_vdw [MAX_NUM_OF_ATOMS],
-                                 float       peratom_elec[MAX_NUM_OF_ATOMS],
-                                 int         debug
-                          );
-
 struct IntraTables{
 	//The following tables will contain the 1/r^6, 1/r^10, 1/r^12, W_el/(r*eps(r)) and W_des*exp(-r^2/(2sigma^2)) functions for
 	//distances 0.01:0.01:20.48 A
@@ -339,21 +326,30 @@ struct IntraTables{
 	float q1q2          [MAX_NUM_OF_ATOMS][MAX_NUM_OF_ATOMS];
 	float qasp_mul_absq [MAX_NUM_OF_ATOMS];
 	bool is_HB          [MAX_NUM_OF_ATYPES] [MAX_NUM_OF_ATYPES];
+	pair_mod* mod_pair  [MAX_NUM_OF_ATYPES] [MAX_NUM_OF_ATYPES];
 
 	// Fill intraE tables
 	IntraTables(
 	            const Liganddata* myligand,
 	            const float       scaled_AD4_coeff_elec,
 	            const float       AD4_coeff_desolv,
-	            const float       qasp
+	            const float       qasp,
+	                  int         nr_mod_atype_pairs,
+	                  pair_mod*   mod_atype_pairs
 	           )
 	{
 		calc_distdep_tables_f(r_6_table, r_10_table, r_12_table, r_epsr_table, desolv_table, scaled_AD4_coeff_elec, AD4_coeff_desolv);
 		calc_q_tables_f(myligand, qasp, q1q2, qasp_mul_absq);
-		for (int type_id1=0; type_id1<myligand->num_of_atypes; type_id1++)
-			for (int type_id2=0; type_id2<myligand->num_of_atypes; type_id2++)
-				is_HB [type_id1][type_id2] = (is_H_bond(myligand->atom_types [type_id1],
-				                              myligand->atom_types [type_id2]) != 0);
+		for (int type_id1=0; type_id1<myligand->num_of_atypes; type_id1++){
+			for (int type_id2=0; type_id2<myligand->num_of_atypes; type_id2++){
+				is_HB    [type_id1][type_id2] = (is_H_bond(myligand->atom_types [type_id1],
+				                                           myligand->atom_types [type_id2]) != 0);
+				mod_pair [type_id1][type_id2] = is_mod_pair(myligand->atom_types[type_id1],
+				                                            myligand->atom_types[type_id2],
+				                                            nr_mod_atype_pairs,
+				                                            mod_atype_pairs);
+			}
+		}
 	}
 };
 
@@ -363,11 +359,9 @@ float calc_intraE_f(
                           float                     smooth,
                           bool                      ignore_desolv,
                     const float                     elec_min_distance,
-                          IntraTables&              tables,
+                          IntraTables*              tables,
                           int                       debug,
                           float&                    interflexE,
-                          int                       nr_mod_atype_pairs,
-                          pair_mod*                 mod_atype_pairs,
                           std::vector<AnalysisData> *analysis = NULL,
                     const ReceptorAtom*             flexres_atoms = NULL,
                           float                     R_cutoff = 2.1,
@@ -375,10 +369,4 @@ float calc_intraE_f(
                           float                     V_cutoff = 4.2
                    );
 
-int map_to_all_maps(
-                    Gridinfo*         mygrid,
-                    Liganddata*       myligand,
-                    std::vector<Map>& all_maps
-                   );
-
 #endif /* PROCESSLIGAND_H_ */
diff --git a/host/inc/processresult.h b/host/inc/processresult.h
index 0fec8a5a..b1e8c0b3 100644
--- a/host/inc/processresult.h
+++ b/host/inc/processresult.h
@@ -40,14 +40,14 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 typedef struct
 {
 	float*                    genotype; // a pointer here is sufficient and saves lots of memory copies
-	Liganddata                reslig_realcoord;
+	double                    atom_idxyzq         [MAX_NUM_OF_ATOMS][5]; // type id .. 0, x .. 1, y .. 2, z .. 3, q ... 4
 	float                     interE;
 	float                     interflexE;
 	float                     interE_elec;
 	float                     intraE;
 	float                     intraflexE;
-	float                     peratom_vdw  [MAX_NUM_OF_ATOMS];
-	float                     peratom_elec [MAX_NUM_OF_ATOMS];
+	float                     peratom_vdw         [MAX_NUM_OF_ATOMS];
+	float                     peratom_elec        [MAX_NUM_OF_ATOMS];
 	float                     rmsd_from_ref;
 	float                     rmsd_from_cluscent;
 	int                       clus_id;
@@ -65,7 +65,7 @@ void arrange_result(
 
 void write_basic_info(
                             FILE*       fp,
-                      const Liganddata* ligand_ref,
+                            Liganddata* ligand_ref,
                       const Dockpars*   mypars,
                       const Gridinfo*   mygrid,
                       const int*        argc,
@@ -74,7 +74,7 @@ void write_basic_info(
 
 void write_basic_info_dlg(
                                 FILE*       fp,
-                          const Liganddata* ligand_ref,
+                                Liganddata* ligand_ref,
                           const Dockpars*   mypars,
                           const Gridinfo*   mygrid,
                           const int*        argc,
@@ -84,14 +84,14 @@ void write_basic_info_dlg(
 void make_resfiles(
                          float*        final_population,
                          float*        energies,
-                   const Liganddata*   ligand_ref,
-                   const Liganddata*   ligand_from_pdb,
+                         IntraTables*  tables,
+                         Liganddata*   ligand_ref,
+                         Liganddata*   ligand_from_pdb,
                    const Liganddata*   ligand_xray,
                    const Dockpars*     mypars,
                          int           evals_performed,
                          int           generations_used,
                    const Gridinfo*     mygrid,
-                   const float*        grids,
                    const int*          argc,
                          char**        argv,
                          int           debug,
@@ -100,23 +100,23 @@ void make_resfiles(
                          Ligandresult* best_result
                   );
 
-void cluster_analysis(
-                            Ligandresult myresults [],
-                            int          num_of_runs,
-                            char*        report_file_name,
-                      const Liganddata*  ligand_ref,
-                      const Dockpars*    mypars,
-                      const Gridinfo*    mygrid,
-                      const int*         argc,
-                            char**       argv,
-                      const double       docking_avg_runtime,
-                      const double       program_runtime
-                     );
-
-void clusanal_gendlg(
+void ligand_calc_output(
+                              FILE*         fp,
+                        const char*         prefix,
+                              IntraTables*  tables,
+                        const Liganddata*   ligand,
+                        const Dockpars*     mypars,
+                        const Gridinfo*     mygrid,
+                              bool          output_analysis,
+                              bool          output_energy
+                       );
+
+void generate_output(
                            Ligandresult  myresults [],
                            int           num_of_runs,
-                     const Liganddata*   ligand_ref,
+                           IntraTables*  tables,
+                           Liganddata*   ligand_ref,
+                     const Liganddata*   ligand_xray,
                      const Dockpars*     mypars,
                      const Gridinfo*     mygrid,
                      const int*          argc,
@@ -130,9 +130,8 @@ void clusanal_gendlg(
 
 void process_result(
                     const Gridinfo*        mygrid,
-                    const float*           cpu_floatgrids,
                     const Dockpars*        mypars,
-                    const Liganddata*      myligand_init,
+                          Liganddata*      myligand_init,
                     const Liganddata*      myxrayligand,
                     const int*             argc,
                           char**           argv,
diff --git a/host/inc/setup.hpp b/host/inc/setup.hpp
index a61ee7a3..aca33d23 100644
--- a/host/inc/setup.hpp
+++ b/host/inc/setup.hpp
@@ -35,26 +35,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "processligand.h"
 #include "getparameters.h"
 
-int preload_gridsize(FileList& filelist);
-
-int load_all_maps(
-                  const char*             fldfilename,
-                  const Gridinfo*         mygrid,
-                        std::vector<Map>& all_maps,
-                        bool              cgmaps
-                 );
-
-int copy_from_all_maps(
-                       const Gridinfo*         mygrid,
-                             float*            fgrids,
-                             std::vector<Map>& all_maps
-                      );
+int preallocated_gridsize(FileList& filelist);
 
 int setup(
-          std::vector<Map>&   all_maps,
-          Gridinfo&           mygrid,
-          std::vector<float>& floatgrids,
-          Dockpars&           mypars,
+          Gridinfo*           mygrid,
+          Dockpars*           mypars,
           Liganddata&         myligand_init,
           Liganddata&         myxrayligand,
           FileList&           filelist,
diff --git a/host/src/calcenergy.cpp b/host/src/calcenergy.cpp
index a9d1b8e8..0c7b442f 100644
--- a/host/src/calcenergy.cpp
+++ b/host/src/calcenergy.cpp
@@ -149,7 +149,7 @@ int prepare_const_fields_for_gpu(
 
 	if (myligand_reference->num_of_intraE_contributors > MAX_INTRAE_CONTRIBUTORS)
 	{
-		printf("Error: number of intramolecular energy contributor is too high!\n");
+		printf("Error: Number of intramolecular energy contributor is larger than maximum (%d).\n",MAX_INTRAE_CONTRIBUTORS);
 		fflush(stdout);
 		return 1;
 	}
@@ -212,7 +212,7 @@ int prepare_const_fields_for_gpu(
 	// generate rotation list
 	if (gen_rotlist(myligand_reference, rotlist) != 0)
 	{
-		printf("Error: number of required rotations is too high!\n");
+		printf("Error: Number of required rotations is larger than maximum (%d).\n",MAX_NUM_OF_ROTATIONS);
 		return 1;
 	}
 
@@ -252,14 +252,14 @@ int prepare_const_fields_for_gpu(
 	}
 
 	for (i=0; i < myligand_reference->num_of_rotbonds; i++)
-	{	
+	{
 		// Pointing to the mem area corresponding to a given rotbond
 		intpoi = rotbonds_atoms + MAX_NUM_OF_ATOMS*i;
 
 		for (j=0; j < myligand_reference->num_of_atoms; j++)
 		{
 			/*
-			rotbonds_atoms [MAX_NUM_OF_ATOMS*i+j] = myligand_reference->atom_rotbonds [j][i]; // 
+			rotbonds_atoms [MAX_NUM_OF_ATOMS*i+j] = myligand_reference->atom_rotbonds [j][i];
 			*/
 			
 			// If an atom rotates with a rotbond, then
@@ -268,8 +268,8 @@ int prepare_const_fields_for_gpu(
 			if (myligand_reference->atom_rotbonds [j][i] == 1){
 				*intpoi = j;
 				intpoi++;
-				num_rotating_atoms_per_rotbond [i] ++;	
-			}	
+				num_rotating_atoms_per_rotbond [i] ++;
+			}
 
 		}
 	}
diff --git a/host/src/getparameters.cpp b/host/src/getparameters.cpp
index 51b27731..e05d0157 100644
--- a/host/src/getparameters.cpp
+++ b/host/src/getparameters.cpp
@@ -133,7 +133,8 @@ int dpf_token(const char* token)
 int parse_dpf(
               Dockpars* mypars,
               Gridinfo* mygrid,
-              FileList& filelist
+              FileList& filelist,
+              bool get_grid_info = true
              )
 {
 	if (mypars->dpffile)
@@ -158,14 +159,13 @@ int parse_dpf(
 		//   performance reasons (as they can be read once)
 		// - each ligand is still going to be limited to MAX_NUM_OF_ATYPES
 		char ltypes[4*MAX_NUM_OF_ATYPES][4];
-		char* typestr;
 		memset(ltypes,0,16*MAX_NUM_OF_ATYPES*sizeof(char));
-		unsigned int idx;
+		std::string map_fn;
+		int idx;
 		pair_mod* curr_pair;
 		float paramA, paramB;
 		int m, n;
 		char typeA[4], typeB[4];
-		filelist.max_len = 256;
 		bool new_device = false; // indicate if current mypars has a new device requested
 		unsigned int run_cnt=0;
 		while(std::getline(file, line)) {
@@ -174,6 +174,10 @@ int parse_dpf(
 			tempstr[0]='\0';
 			sscanf(line.c_str(),"%255s",tempstr);
 			int token_id = dpf_token(tempstr);
+			if (token_id >= DPF_MOVE ){ // take care of end-comments for regular tokens
+				int comment_loc = line.find("#");
+				if(comment_loc>0) line.erase(comment_loc,line.size()-comment_loc);
+			}
 			switch(token_id){
 				case DPF_MOVE: // movable ligand file name
 						if(!mypars->xml2dlg){
@@ -190,13 +194,12 @@ int parse_dpf(
 						}
 						break;
 				case DPF_FLD: // grid data file name
-						if(!mypars->xml2dlg){
+						if(get_grid_info){
 							sscanf(line.c_str(),"%*s %255s",argstr);
 							// Add the .fld file
 							if(mypars->fldfile) free(mypars->fldfile);
 							mypars->fldfile = strdup(argstr); // this allows using the dpf to set up all parameters but the ligand
 							// Filling mygrid according to the specified fld file
-							mygrid->info_read = false;
 							if (get_gridinfo(mypars->fldfile, mygrid) != 0)
 							{
 								printf("\nError: get_gridinfo failed with fld file specified with <%s> parameter at %s:%u.\n",tempstr,mypars->dpffile,line_count);
@@ -230,14 +233,28 @@ int parse_dpf(
 						mtype_nr=0;
 						break;
 				case DPF_MAP: // grid map specifier
-						sscanf(line.c_str(),"%*s %255s",argstr);
-						argstr[strlen(argstr)-4] = '\0'; // get rid of .map extension
-						typestr=strchr(argstr+strlen(argstr)-4,'.')+1; // 4 chars for atom type
 						if(mtype_nr>=ltype_nr){
 							printf("\nError: More map files specified than atom types at %s:%u (ligand types need to be specified before maps).\n",mypars->dpffile,line_count);
 							return 1;
 						}
-						if(strcmp(typestr,ltypes[mtype_nr])){ // derived type
+						sscanf(line.c_str(),"%*s %255s",argstr);
+						map_fn=argstr;
+						if(mygrid->grid_mapping.size()<2){
+							printf("Error: fld keyword needs to be placed before <%s> parameter at %s:%u.\n",tempstr,mypars->dpffile,line_count);
+							return 1;
+						}
+						n=-1;
+						for(m=mygrid->grid_mapping.size()/2; m<mygrid->grid_mapping.size(); m++)
+							if(map_fn.find(mygrid->grid_mapping[m])!=std::string::npos){
+								n=m-mygrid->grid_mapping.size()/2;
+								break;
+							}
+						if(n<0){
+							printf("Error: No matching map file <%s> specified in fld file at %s:%u.\n",argstr,mypars->dpffile,line_count);
+							return 1;
+						}
+						strcpy(argstr,mygrid->grid_mapping[n].c_str());
+						if(strcmp(argstr,ltypes[mtype_nr])){ // derived type
 							if(mypars->nr_deriv_atypes==0){ // get the derived atom types started
 								mypars->deriv_atypes=(deriv_atype*)malloc(sizeof(deriv_atype));
 								if(mypars->deriv_atypes==NULL){
@@ -245,15 +262,24 @@ int parse_dpf(
 									return 1;
 								}
 							}
-							if(!add_deriv_atype(mypars,ltypes[mtype_nr],strlen(ltypes[mtype_nr]))){
+							idx = add_deriv_atype(mypars,ltypes[mtype_nr],strlen(ltypes[mtype_nr]),true);
+							if(idx == 0){
 								printf("Error: Derivative (ligand type %s) names can only be upto 3 characters long.\n",ltypes[mtype_nr]);
 								return 1;
 							}
-							idx = mypars->nr_deriv_atypes-1;
-							strcpy(mypars->deriv_atypes[idx].base_name,typestr);
+							if(idx>0){
+								idx = mypars->nr_deriv_atypes-1;
+								strcpy(mypars->deriv_atypes[idx].base_name,argstr);
 #ifdef DERIVTYPE_INFO
-							printf("%i: %s=%s\n",mypars->deriv_atypes[idx].nr,mypars->deriv_atypes[idx].deriv_name,mypars->deriv_atypes[idx].base_name);
+								printf("%i: %s=%s\n",mypars->deriv_atypes[idx].nr,mypars->deriv_atypes[idx].deriv_name,mypars->deriv_atypes[idx].base_name);
 #endif
+							} else{ // same derived type name - make sure base type is the same
+								idx++; // idx is -type idx-1
+								if(strcmp(mypars->deriv_atypes[-idx].base_name,argstr)){
+									printf("Error: Redefinition of ligand type %s with different map types.\n",ltypes[mtype_nr]);
+									return 1;
+								}
+							}
 						}
 						mtype_nr++;
 						break;
@@ -329,8 +355,19 @@ int parse_dpf(
 									printf("\nError: No map file on record yet. Please specify a map file before the first ligand.\n");
 									return 1;
 								}
-								filelist.fld_files.push_back(mypars->fldfile);
+								if(filelist.fld_files.size()>0){
+									// only add to fld_files if different from previous one
+									if(strcmp(mypars->fldfile,filelist.fld_files.back().name.c_str())!=0){
+										filelist.fld_files.push_back({mypars->fldfile,filelist.mygrids.size()});
+										// Also add the grid
+										filelist.mygrids.push_back(*mygrid);
+									}
+								} else{
+									filelist.fld_files.push_back({mypars->fldfile,filelist.mygrids.size()});
+									filelist.mygrids.push_back(*mygrid);
+								}
 								mypars->list_nr++;
+								mypars->filelist_grid_idx=filelist.fld_files.back().grid_idx;
 								// If more than one unique protein, cant do map preloading yet
 								if (filelist.fld_files.size()>1){
 									filelist.preload_maps=false;
@@ -346,7 +383,10 @@ int parse_dpf(
 									mypars->resname[len]='\0';
 								} else mypars->resname = strdup("docking"); // Fallback to old default
 								filelist.resnames.push_back(mypars->resname);
-								if(new_device) mypars->devices_requested++;
+								if(new_device){
+									mypars->dev_pool_nr=mypars->dev_pool.size();
+									mypars->dev_pool.push_back(mypars->devnum);
+								}
 								// Before pushing parameters and grids back make sure
 								// the filename pointers are unique
 								if(filelist.mypars.size()>0){ // mypars and mygrids have same size
@@ -356,20 +396,9 @@ int parse_dpf(
 									if((filelist.mypars.back().xrayligandfile) &&
 									   (filelist.mypars.back().xrayligandfile==mypars->xrayligandfile))
 										mypars->xrayligandfile=strdup(mypars->xrayligandfile);
-									if((filelist.mygrids.back().grid_file_path) &&
-									   (filelist.mygrids.back().grid_file_path==mygrid->grid_file_path))
-										mygrid->grid_file_path=strdup(mygrid->grid_file_path);
-									if((filelist.mygrids.back().receptor_name) &&
-									   (filelist.mygrids.back().receptor_name==mygrid->receptor_name))
-										mygrid->receptor_name=strdup(mygrid->receptor_name);
-									if((filelist.mygrids.back().map_base_name) &&
-									   (filelist.mygrids.back().map_base_name==mygrid->map_base_name))
-										mygrid->map_base_name=strdup(mygrid->map_base_name);
 								}
 								// Add the parameter block now that resname is set
 								filelist.mypars.push_back(*mypars);
-								// Also add the grid
-								filelist.mygrids.push_back(*mygrid);
 							}
 						} else{
 							if(token_id!=DPF_RUNS) run_cnt++;
@@ -512,10 +541,12 @@ int parse_dpf(
 							}
 							// count GPUs in case we set a different one
 							if(argcmp("devnum",tempstr,'D')){
-								new_device=false;
-								for(i=0; (i<(int)filelist.mypars.size())&&!new_device; i++){
-									if(mypars->devnum==filelist.mypars[i].devnum){
-										new_device=true;
+								new_device=true;
+								for(i=0; i<mypars->dev_pool.size(); i++){
+									if(mypars->devnum==mypars->dev_pool[i]){
+										new_device=false;
+										mypars->dev_pool_nr=i;
+										break;
 									}
 								}
 							}
@@ -535,13 +566,13 @@ int parse_dpf(
 	return 0;
 }
 
-int preparse_dpf(
-                 const int*      argc,
-                       char**    argv,
-                       Dockpars* mypars,
-                       Gridinfo* mygrid,
-                       FileList& filelist
-                )
+int initial_commandpars(
+                        const int*      argc,
+                              char**    argv,
+                              Dockpars* mypars,
+                              Gridinfo* mygrid,
+                              FileList& filelist
+                       )
 // This function checks if a dpf file is used and, if runs are specified, map and ligand information
 // is stored in the filelist; flexres information and which location in the dpf parameters are in each
 // run is stored separately to allow logical parsing with the correct parameters initialized per run
@@ -549,6 +580,10 @@ int preparse_dpf(
 	bool output_multiple_warning = true;
 	std::vector<std::string> xml_files;
 	bool read_more_xml_files = false;
+#ifdef TOOLMODE
+	mypars->xml2dlg = true;
+	read_more_xml_files = true;
+#endif
 	int error;
 	for (int i=1; i<(*argc)-1+(read_more_xml_files); i++)
 	{
@@ -573,11 +608,16 @@ int preparse_dpf(
 				}
 			}
 			mypars->dpffile = strdup(argv[i+1]);
+			i++;
 		}
 		
 		// Argument: load initial data from xml file and reconstruct dlg, then finish
 		if (argcmp("xml2dlg", argv [i], 'X'))
 		{
+			if(mypars->xml_files>0){
+				printf("Error: Only one --xml2dlg (-X) argument is allowed.\n");
+				return 1;
+			}
 			mypars->load_xml = strdup(argv[i+1]);
 			read_more_xml_files = true;
 			mypars->xml2dlg = true;
@@ -603,6 +643,7 @@ int preparse_dpf(
 				sscanf(argv[i+1], "%f,%f,%f", &(mypars->R_cutoff), &(mypars->H_cutoff), &(mypars->V_cutoff));
 				mypars->contact_analysis = true;
 			}
+			i++;
 		}
 		
 		// Argument: print dlg output to stdout instead of to a file
@@ -613,7 +654,11 @@ int preparse_dpf(
 				mypars->dlg2stdout = false;
 			else
 				mypars->dlg2stdout = true;
+			i++;
 		}
+#ifdef TOOLMODE
+		read_more_xml_files = true;
+#endif
 	}
 	
 	bool specified_dpf = (mypars->dpffile!=NULL);
@@ -658,21 +703,30 @@ int preparse_dpf(
 			                   mypars->flexresfile,
 			                   mypars->list_nr,
 			                   mypars->seed);
-			if(!specified_dpf){ // parse dpf file in XML file unless user specified one
-				if((error=parse_dpf(mypars,mygrid,filelist))) return error;
-			}
-			mypars->pop_size=1;
+
 			// Filling mygrid according to the specified fld file
-			mygrid->info_read = false;
 			if (get_gridinfo(mypars->fldfile, mygrid) != 0)
 			{
 				printf("\nError: get_gridinfo failed with fld file (%s) specified in %s.\n",mypars->fldfile,mypars->load_xml);
 				return 1;
 			}
-			if(prev_fld_file){ // unfortunately, some strcmp implementation segfault with NULL as input
-				if(strcmp(prev_fld_file,mypars->fldfile) != 0)
-					filelist.fld_files.push_back(mypars->fldfile);
-			} else filelist.fld_files.push_back(mypars->fldfile);
+
+			if(!specified_dpf){ // parse dpf file in XML file unless user specified one
+				if((error=parse_dpf(mypars,mygrid,filelist,false))) return error;
+			}
+			mypars->pop_size=1;
+
+			if(filelist.fld_files.size()>0){
+				// only add to fld_files if different from previous one
+				if(strcmp(mypars->fldfile,filelist.fld_files.back().name.c_str())!=0){
+					filelist.fld_files.push_back({mypars->fldfile,filelist.mygrids.size()});
+					filelist.mygrids.push_back(*mygrid);
+				}
+			} else{
+				filelist.fld_files.push_back({mypars->fldfile,filelist.mygrids.size()});
+				filelist.mygrids.push_back(*mygrid);
+			}
+			mypars->filelist_grid_idx=filelist.fld_files.back().grid_idx;
 
 			// If more than one unique protein, cant do map preloading yet
 			if (filelist.fld_files.size()>1)
@@ -681,20 +735,23 @@ int preparse_dpf(
 			// Add the ligand filename in the xml to the filelist
 			filelist.ligand_files.push_back(mypars->ligandfile);
 			filelist.mypars.push_back(*mypars);
-			filelist.mygrids.push_back(*mygrid);
 		}
 		if(mypars->xml_files>100) printf("\n\n");
 		filelist.nfiles = mypars->xml_files;
 	} else{
+#ifdef TOOLMODE
+		printf("Error: No xml files specified.\n\n");
+		print_options(argv[0]);
+		return 1;
+#endif
 		filelist.nfiles = filelist.ligand_files.size();
 	}
 	if(filelist.nfiles>0){
 		filelist.used = true;
 		if(mypars->contact_analysis && filelist.preload_maps){
 			std::string receptor_name=mygrid->grid_file_path;
-			if(strlen(mygrid->grid_file_path)>0) receptor_name+="/";
-			receptor_name += mygrid->receptor_name;
-			receptor_name += ".pdbqt";
+			if(mygrid->grid_file_path.size()>0) receptor_name+="/";
+			receptor_name += mygrid->receptor_name + ".pdbqt";
 			mypars->receptor_atoms = read_receptor(receptor_name.c_str(),mygrid,mypars->receptor_map,mypars->receptor_map_list);
 			mypars->nr_receptor_atoms = mypars->receptor_atoms.size();
 		}
@@ -716,102 +773,180 @@ int get_filelist(
 		filelist.preload_maps&=filelist.used;
 		return 0;
 	}
-	bool output_multiple_warning = true;
-	for (int i=1; i<(*argc)-1; i++)
+	bool read_ligands = false;
+	std::vector<char*> ligands;
+	for (int i=1; i<(*argc)-1+(read_ligands); i+=1+(!read_ligands))
 	{
+		// wildcards for -filelist are allowed (or multiple file names)
+		// - one file specified is the filelist containing file
+		// - more than one file will be multiple ligands
+		// the test below is to stop reading arguments as filenames when another argument starts with "-"
+		if (read_ligands && (argv[i][0]=='-')){
+			read_ligands = false;
+			if(i>=(*argc)-1) break; // ignore last argument if there is no parameter specified
+		} else if (read_ligands) ligands.push_back(argv[i]); // copy argument into ligands while read_ligands is true
+		
+		if (argcmp("xml2dlg", argv[i], 'X'))
+			i+=mypars->xml_files-1; // skip ahead in case there are multiple entries here
+		
 		// Argument: file name that contains list of files.
 		if (argcmp("filelist", argv[i], 'B'))
 		{
-			filelist.used = true;
+			if(ligands.size()>0){
+				printf("Error: Only one --filelist (-B) argument is allowed.\n");
+				return 1;
+			}
 			if(filelist.filename){
 				free(filelist.filename);
-				if(output_multiple_warning){
-					printf("Warning: Multiple --filelist (-B) arguments, only the last one will be used.");
-					output_multiple_warning = false;
-				}
+				filelist.filename = NULL;
 			}
-			filelist.filename = strdup(argv[i+1]);
+			read_ligands=true;
 		}
 	}
+	mypars->filelist_files = ligands.size();
+	if(ligands.size()>1){
+		// Need to setup file names from command line in case they weren't set with a dpf
+		if (get_filenames_and_ADcoeffs(argc, argv, mypars, filelist.used, false) != 0){
+			return 1;
+		}
+		// use current (aka last specified) fld file for this file list
+		if(mypars->fldfile==NULL){
+			printf("Error: Argument --filelist (-B) with ligand files needs a grid file. Please specify through --ffile (-M) or --import_dpf (-I).\n");
+			return 1;
+		}
+		filelist.fld_files.push_back({mypars->fldfile,filelist.mygrids.size()});
+		mypars->filelist_grid_idx=filelist.fld_files.back().grid_idx;
+		// Filling mygrid according to the specified fld file
+		if (get_gridinfo(mypars->fldfile, mygrid) != 0)
+		{
+			printf("Error: get_gridinfo failed with fld file specified in file list.\n");
+			return 1;
+		}
+		// Add the grid info
+		filelist.mygrids.push_back(*mygrid);
+		for(unsigned int i=0; i<ligands.size(); i++){
+			// Need new mypars->fldfile char* block to preserve previous one
+			if(filelist.mypars.size()>0){
+				if((filelist.mypars.back().fldfile) &&
+				   (filelist.mypars.back().fldfile==mypars->fldfile))
+					mypars->fldfile=strdup(mypars->fldfile);
+				if((filelist.mypars.back().flexresfile) &&
+				   (filelist.mypars.back().flexresfile==mypars->flexresfile))
+					mypars->flexresfile=strdup(mypars->flexresfile);
+				if((filelist.mypars.back().xrayligandfile) &&
+				   (filelist.mypars.back().xrayligandfile==mypars->xrayligandfile))
+					mypars->xrayligandfile=strdup(mypars->xrayligandfile);
+			}
+			mypars->ligandfile = strdup(ligands[i]);
+			filelist.ligand_files.push_back(ligands[i]);
+			mypars->list_nr++;
+			long long len = strrchr(ligands[i],'.')-ligands[i];
+			if(len<1) len=strlen(ligands[i]);
+			filelist.resnames.push_back(filelist.ligand_files[i].substr(0,len));
+			mypars->resname=strdup(filelist.resnames[i].c_str());
+			// Add the parameter block
+			filelist.mypars.push_back(*mypars);
+		}
+		filelist.nfiles = filelist.ligand_files.size();
+		if(filelist.nfiles>0) filelist.used = true;
+		
+		filelist.preload_maps&=filelist.used;
+		if(mypars->contact_analysis && filelist.preload_maps){
+			std::string receptor_name=mygrid->grid_file_path;
+			if(mygrid->grid_file_path.size()>0) receptor_name+="/";
+			receptor_name += mygrid->receptor_name + ".pdbqt";
+			mypars->receptor_atoms = read_receptor(receptor_name.c_str(),mygrid,mypars->receptor_map,mypars->receptor_map_list);
+			mypars->nr_receptor_atoms = mypars->receptor_atoms.size();
+		}
+		return 0;
+	} else if(ligands.size()==1) filelist.filename = strdup(ligands[0]);
 
 	if (filelist.filename){ // true when -filelist specifies a filename
 	                        // filelist.used may be true when dpf file is specified as it uses the filelist to store runs
 		std::ifstream file(filelist.filename);
 		if(file.fail()){
-			printf("\nError: Could not open filelist %s. Check path and permissions.\n",filelist.filename);
+			printf("\nError: Could not open file list %s. Check path and permissions.\n",filelist.filename);
 			return 1;
 		}
 		std::string line;
 		bool prev_line_was_fld=false;
+		int prev_fld_line;
 		unsigned int initial_res_count = filelist.resnames.size();
 		int len;
+		int last_fld_idx=0;
 		int line_count=0;
 		while(std::getline(file, line)) {
 			line_count++;
 			trim(line); // Remove leading and trailing whitespace
 			len = line.size();
-			if(len>filelist.max_len) filelist.max_len = len;
 			if (len>=4 && line.compare(len-4,4,".fld") == 0){
+				bool new_grid=true;
 				if (prev_line_was_fld){ // Overwrite the previous fld file if two in a row
-					filelist.fld_files[filelist.fld_files.size()-1] = line;
-					printf("\nWarning: using second listed .fld file in line %d\n",line_count);
+					filelist.mygrids.pop_back(); // previous map is invalid now and will be overwritten by new one
+					filelist.fld_files.back() = {line,filelist.mygrids.size()};
+					printf("Warning: Fld file specified in line %d of the file list is superceded by line %d.\n\n",prev_fld_line,line_count);
 				} else {
-					// Add the .fld file
-					filelist.fld_files.push_back(line);
+					// Add the fld file if different from previous
+					if(filelist.fld_files.size()>0){
+						new_grid=false;
+						if(line.compare(filelist.fld_files.back().name)!=0){
+							filelist.fld_files.push_back({line,filelist.mygrids.size()});
+							new_grid=true;
+						}
+					} else filelist.fld_files.push_back({line,filelist.mygrids.size()});
 					prev_line_was_fld=true;
 
 					// If more than one unique protein, cant do map preloading yet
-					if (filelist.fld_files.size()>1){
+					if (filelist.fld_files.size()>0){
 						filelist.preload_maps=false;
 					}
 				}
+				mypars->filelist_grid_idx = filelist.fld_files.back().grid_idx;
+				prev_fld_line=line_count;
+				// Keep mypars->fldfile current (need new char* block to preserve previous one)
+				mypars->fldfile = strdup(filelist.fld_files.back().name.c_str());
 				// Filling mygrid according to the specified fld file
-				mygrid->info_read = false;
-				if (get_gridinfo(filelist.fld_files[filelist.fld_files.size()-1].c_str(), mygrid) != 0)
+				if (get_gridinfo(mypars->fldfile, mygrid) != 0)
 				{
-					printf("\nError: get_gridinfo failed with fld file specified in filelist.\n");
+					printf("Error: get_gridinfo failed with fld file specified in file list.\n");
 					return 1;
 				}
+				// Add the grid info
+				if(new_grid) filelist.mygrids.push_back(*mygrid);
+			} else if (len>=7 && line.compare(len-7,7,".pdbqt*") == 0){
+				// Add the reference (xray) ligand file
+				mypars->xrayligandfile = strndup(line.c_str(),len-1);
+				mypars->given_xrayligandfile = true;
 			} else if (len>=6 && line.compare(len-6,6,".pdbqt") == 0){
 				// Add the .pdbqt
 				filelist.ligand_files.push_back(line);
 				mypars->list_nr++;
 				// Before pushing parameters and grids back make sure
-				// the filename pointers are unique
-				if(filelist.mypars.size()>0){ // mypars and mygrids have same size
+				// the filename pointers are unique in the filelist
+				if(filelist.mypars.size()>0){
+					if((filelist.mypars.back().fldfile) &&
+					   (filelist.mypars.back().fldfile==mypars->fldfile))
+						mypars->fldfile=strdup(mypars->fldfile);
 					if((filelist.mypars.back().flexresfile) &&
 					   (filelist.mypars.back().flexresfile==mypars->flexresfile))
 						mypars->flexresfile=strdup(mypars->flexresfile);
 					if((filelist.mypars.back().xrayligandfile) &&
 					   (filelist.mypars.back().xrayligandfile==mypars->xrayligandfile))
 						mypars->xrayligandfile=strdup(mypars->xrayligandfile);
-					if((filelist.mygrids.back().grid_file_path) &&
-					   (filelist.mygrids.back().grid_file_path==mygrid->grid_file_path))
-						mygrid->grid_file_path=strdup(mygrid->grid_file_path);
-					if((filelist.mygrids.back().receptor_name) &&
-					   (filelist.mygrids.back().receptor_name==mygrid->receptor_name))
-						mygrid->receptor_name=strdup(mygrid->receptor_name);
-					if((filelist.mygrids.back().map_base_name) &&
-					   (filelist.mygrids.back().map_base_name==mygrid->map_base_name))
-						mygrid->map_base_name=strdup(mygrid->map_base_name);
 				}
-				// Add the parameter block
-				filelist.mypars.push_back(*mypars);
-				// Add the grid info
-				filelist.mygrids.push_back(*mygrid);
+				// Keep track of fld files
 				if (filelist.fld_files.size()==0){
-					if(mygrid->info_read){ // already read a map file in with dpf import
-						printf("\nUsing map file from dpf import.\n");
-						filelist.fld_files.push_back(mypars->fldfile);
+					if(mygrid->fld_name.length()>0){ // already read a map file in with dpf import
+						printf("Using map file from dpf import.\n\n");
 					} else{
-						printf("\nError: No map file on record yet. Please specify a .fld file before the first ligand (%s).\n",line.c_str());
+						printf("Error: No map file on record yet. Please specify a .fld file before the first ligand (%s).\n",line.c_str());
 						return 1;
 					}
 				}
-				if (filelist.ligand_files.size()>filelist.fld_files.size()){
-					// If this ligand doesnt have a protein preceding it, use the previous protein
-					filelist.fld_files.push_back(filelist.fld_files[filelist.fld_files.size()-1]);
-				}
+				// Add the parameter block
+				filelist.mypars.push_back(*mypars);
+				// Keep track of fld lines actually used
+				last_fld_idx = filelist.fld_files.size();
 				prev_line_was_fld=false;
 			} else if (len>0) {
 				// Anything else in the file is assumed to be the resname
@@ -820,23 +955,26 @@ int get_filelist(
 		}
 
 		filelist.nfiles = filelist.ligand_files.size();
+		if(filelist.nfiles>0) filelist.used = true;
 
 		if (filelist.ligand_files.size()==0){
-			printf("\nError: No ligands, through lines ending with the .pdbqt suffix, have been specified.\n");
+			printf("Error: No ligands, through lines ending with the .pdbqt suffix, have been specified.\n");
 			return 1;
 		}
 		if (filelist.ligand_files.size() != filelist.resnames.size()){
 			if(filelist.resnames.size()-initial_res_count>0){ // make sure correct number of resnames were specified when they were specified
-				printf("\nError: Inconsistent number of resnames (%lu) compared to ligands (%lu)!\n",filelist.resnames.size(),filelist.ligand_files.size());
+				printf("Error: Inconsistent number of resnames (%lu) compared to ligands (%lu)!\n",filelist.resnames.size(),filelist.ligand_files.size());
 				return 1;
 			} else{ // otherwise add default resname (ligand basename)
-				for(unsigned int i=filelist.resnames.size(); i<filelist.ligand_files.size(); i++)
-					filelist.resnames.push_back(filelist.ligand_files[i].substr(0,filelist.ligand_files[i].size()-6));
+				for(unsigned int i=filelist.resnames.size(); i<filelist.ligand_files.size(); i++){
+					const char* ln = filelist.ligand_files[i].c_str();
+					long long len = strrchr(ln,'.')-ln;
+					if(len<1) len=strlen(ln);
+					filelist.resnames.push_back(filelist.ligand_files[i].substr(0,len));
+				}
 			}
 		}
 		for(unsigned int i=initial_res_count; i<filelist.ligand_files.size(); i++){
-			if(filelist.mypars[i].fldfile) free(filelist.mypars[i].fldfile);
-			filelist.mypars[i].fldfile = strdup(filelist.fld_files[i].c_str());
 			if(filelist.mypars[i].ligandfile) free(filelist.mypars[i].ligandfile);
 			filelist.mypars[i].ligandfile = strdup(filelist.ligand_files[i].c_str());
 			if(filelist.mypars[i].resname) free(filelist.mypars[i].resname);
@@ -844,6 +982,13 @@ int get_filelist(
 		}
 	}
 	filelist.preload_maps&=filelist.used;
+	if(mypars->contact_analysis && filelist.preload_maps){
+		std::string receptor_name=mygrid->grid_file_path;
+		if(mygrid->grid_file_path.size()>0) receptor_name+="/";
+		receptor_name += mygrid->receptor_name + ".pdbqt";
+		mypars->receptor_atoms = read_receptor(receptor_name.c_str(),mygrid,mypars->receptor_map,mypars->receptor_map_list);
+		mypars->nr_receptor_atoms = mypars->receptor_atoms.size();
+	}
 
 	return 0;
 }
@@ -852,7 +997,8 @@ int get_filenames_and_ADcoeffs(
                                const int*      argc,
                                      char**    argv,
                                      Dockpars* mypars,
-                               const bool      multiple_files
+                               const bool      multiple_files,
+                               const bool      missing_error
                               )
 // The function fills the file name and coeffs fields of mypars parameter
 // according to the proper command line arguments.
@@ -864,8 +1010,14 @@ int get_filenames_and_ADcoeffs(
 	ffile_given = (mypars->fldfile!=NULL);
 	lfile_given = (mypars->ligandfile!=NULL);
 	
-	for (i=1; i<(*argc)-1; i++)
+	for (i=1; i<(*argc)-1; i+=2)
 	{
+		if (argcmp("filelist", argv[i], 'B'))
+			i+=mypars->filelist_files-1; // skip ahead in case there are multiple entries here
+		
+		if (argcmp("xml2dlg", argv[i], 'X'))
+			i+=mypars->xml_files-1; // skip ahead in case there are multiple entries here
+		
 		if (!multiple_files){
 			// Argument: grid parameter file name.
 			if (argcmp("ffile", argv[i], 'M'))
@@ -914,18 +1066,16 @@ int get_filenames_and_ADcoeffs(
 		}
 	}
 
-	if (ffile_given == 0 && !multiple_files)
+	if (ffile_given == 0 && !multiple_files && missing_error)
 	{
 		printf("Error: grid fld file was not defined. Use --ffile (-M) argument!\n");
 		print_options(argv[0]);
-		return 1; // we'll never get here - but we might in the future again ...
 	}
 
-	if (lfile_given == 0 && !multiple_files)
+	if (lfile_given == 0 && !multiple_files && missing_error)
 	{
 		printf("Error: ligand pdbqt file was not defined. Use --lfile (-L) argument!\n");
 		print_options(argv[0]);
-		return 1; // we'll never get here - but we might in the future again ...
 	}
 
 	return 0;
@@ -936,30 +1086,42 @@ void print_options(
                   )
 {
 	printf("Command line options:\n\n");
-	printf(" Argument              | Description                                           | Default value\n");
-	printf("-----------------------|-------------------------------------------------------|------------------\n");
+	printf("Arguments              | Description                                           | Default value\n");
+	printf("-----------------------+-------------------------------------------------------+------------------\n");
+#ifndef TOOLMODE
+	printf("\nINPUT\n");
 	printf("--lfile             -L | Ligand pdbqt file                                     | no default\n");
 	printf("--ffile             -M | Grid map files descriptor fld file                    | no default\n");
 	printf("--flexres           -F | Flexible residue pdbqt file                           | no default\n");
 	printf("--filelist          -B | Batch file                                            | no default\n");
 	printf("--import_dpf        -I | Import AD4-type dpf input file (only partial support) | no default\n");
-	printf("--resnam            -N | Name for docking output log                           | ligand basename\n");
 	printf("--xraylfile         -R | reference ligand file for RMSD analysis               | ligand file\n");
-	printf("--devnum            -D | OpenCL/Cuda device number (counting starts at 1)      | 1\n");
-	printf("--derivtype         -T | Derivative atom types (e.g. C1,C2,C3=C/S4=S/H5=HD)    | no default\n");
-	printf("--modpair           -P | Modify vdW pair params (e.g. C1:S4,1.60,1.200,13,7)   | no default\n");
-	printf("--heuristics        -H | Ligand-based automatic search method and # evals      | 1 (yes)\n");
-	printf("--heurmax           -E | Asymptotic heuristics # evals limit (smooth limit)    | 12000000\n");
-	printf("--autostop          -A | Automatic stopping criterion based on convergence     | 1 (yes)\n");
-	printf("--asfreq            -a | AutoStop testing frequency (in # of generations)      | 5\n");
-	printf("--contact_analysis  -C | Perform distance-based analysis (description below)   | 0 (no)\n");
+	printf("\nCONVERSION\n");
 	printf("--xml2dlg           -X | One (or many) AD-GPU xml file(s) to convert to dlg(s) | no default\n");
+#endif
+	printf("\nOUTPUT\n");
+	printf("--resnam            -N | Name for docking output log                           | ligand basename\n");
+	printf("--contact_analysis  -C | Perform distance-based analysis (description below)   | 0 (no)\n");
 	printf("--xmloutput         -x | Specify if xml output format is wanted                | 1 (yes)\n");
-	printf("--loadxml           -c | Load initial population from xml results file         | no default\n");
 	printf("--dlgoutput         -d | Control if dlg output is created                      | 1 (yes)\n");
 	printf("--dlg2stdout        -2 | Write dlg file output to stdout (if not OVERLAP=ON)   | 0 (no)\n");
+	printf("--rlige                | Print reference ligand energies                       | 0 (no)\n");
+	printf("--gfpop                | Output all poses from all populations of each LGA run | 0 (no)\n");
+	printf("--npdb                 | # pose pdbqt files from populations of each LGA run   | 0\n");
+	printf("--gbest                | Output single best pose as pdbqt file                 | 0 (no)\n");
+	printf("--clustering           | Output clustering analysis in dlg and/or xml file     | 1 (yes)\n");
+	printf("--hsym                 | Handle symmetry in RMSD calc.                         | 1 (yes)\n");
+	printf("--rmstol               | RMSD clustering tolerance                             | 2 (Å)\n");
+#ifndef TOOLMODE
+	printf("\nSETUP\n");
+	printf("--devnum            -D | OpenCL/Cuda device number (counting starts at 1)      | 1\n");
+	printf("--loadxml           -c | Load initial population from xml results file         | no default\n");
 	printf("--seed              -s | Random number seeds (up to three comma-sep. integers) | time, process id\n");
-	printf("--ubmod             -u | Unbound model: 0 (bound), 1 (extended), 2 (compact)   | 0 (same as bound)\n");
+	printf("\nSEARCH\n");
+	printf("--heuristics        -H | Ligand-based automatic search method and # evals      | 1 (yes)\n");
+	printf("--heurmax           -E | Asymptotic heuristics # evals limit (smooth limit)    | 12000000\n");
+	printf("--autostop          -A | Automatic stopping criterion based on convergence     | 1 (yes)\n");
+	printf("--asfreq            -a | AutoStop testing frequency (in # of generations)      | 5\n");
 	printf("--nrun              -n | # LGA runs                                            | 20\n");
 	printf("--nev               -e | # Score evaluations (max.) per LGA run                | 2500000\n");
 	printf("--ngen              -g | # Generations (max.) per LGA run                      | 42000\n");
@@ -970,39 +1132,50 @@ void print_options(
 	printf("--crat                 | Crossover rate                                        | 80  (%%)\n");
 	printf("--lsrat                | Local-search rate                                     | 100 (%%)\n");
 	printf("--trat                 | Tournament (selection) rate                           | 60  (%%)\n");
-	printf("--rlige                | Print reference ligand energies                       | 0 (no)\n");
-	printf("--hsym                 | Handle symmetry in RMSD calc.                         | 1 (yes)\n");
-	printf("--rmstol               | RMSD clustering tolerance                             | 2 (Å)\n");
 	printf("--dmov                 | Maximum LGA movement delta                            | 6 (Å)\n");
 	printf("--dang                 | Maximum LGA angle delta                               | 90 (°)\n");
 	printf("--rholb                | Solis-Wets lower bound of rho parameter               | 0.01\n");
 	printf("--lsmov                | Solis-Wets movement delta                             | 2 (Å)\n");
 	printf("--lsang                | Solis-Wets angle delta                                | 75 (°)\n");
 	printf("--cslim                | Solis-Wets cons. success/failure limit to adjust rho  | 4\n");
+	printf("--stopstd              | AutoStop energy standard deviation tolerance          | 0.15 (kcal/mol)\n");
+	printf("--initswgens           | Initial # generations of Solis-Wets instead of -lsmet | 0 (no)\n");
+#endif
+	printf("\nSCORING\n");
+	printf("--derivtype         -T | Derivative atom types (e.g. C1,C2,C3=C/S4=S/H5=HD)    | no default\n");
+	printf("--modpair           -P | Modify vdW pair params (e.g. C1:S4,1.60,1.200,13,7)   | no default\n");
+	printf("--ubmod             -u | Unbound model: 0 (bound), 1 (extended), 2 (compact)   | 0 (same as bound)\n");
 	printf("--smooth               | Smoothing parameter for vdW interactions              | 0.5 (Å)\n");
 	printf("--elecmindist          | Min. electrostatic potential distance (w/ dpf: 0.5 Å) | 0.01 (Å)\n");
 	printf("--modqp                | Use modified QASP from VirtualDrug or AD4 original    | 0 (no, use AD4)\n");
-	printf("--cgmaps               | Use individual maps for CG-G0 instead of the same one | 0 (no, same map)\n");
-	printf("--stopstd              | AutoStop energy standard deviation tolerance          | 0.15 (kcal/mol)\n");
-	printf("--initswgens           | Initial # generations of Solis-Wets instead of -lsmet | 0 (no)\n");
-	printf("--gfpop                | Output all poses from all populations of each LGA run | 0 (no)\n");
-	printf("--npdb                 | # pose pdbqt files from populations of each LGA run   | 0\n");
-	printf("--gbest                | Output single best pose as pdbqt file                 | 0 (no)\n");
-
-	printf("\nAutodock-GPU requires a ligand and a set of grid maps as well as optionally a flexible residue to\n");
-	printf("perform a docking calculation. These could be specified directly (--lfile, --ffile, and --flexres),\n");
-	printf("as part of a filelist text file (see README.md for format), or as an AD4-style dpf.\n");
-
+#ifndef TOOLMODE
+	printf("\nAutoDock-GPU requires a ligand and a set of grid maps to perform a docking calculation. Optionally,\n");
+	printf("one or multiple flexible residues may be provided. These inputs could be specified directly (--lfile,\n");
+	printf("--ffile, and --flexres), as part of a file list text file (see README.md), or in an AD4-style dpf.\n");
+#endif
 	printf("\nExamples:\n");
+#ifndef TOOLMODE
 	printf("   * Dock ligand.pdbqt to receptor.maps.fld using 50 LGA runs:\n");
 	printf("        %s --lfile ligand.pdbqt --ffile receptor.maps.fld --nrun 50\n",program_name);
+#endif
+	printf("   * Convert all xml files to their respective dlg and perform contact analysis:\n");
+#ifndef TOOLMODE
+	printf("        %s --xml2dlg *.xml --contact_analysis 1\n",program_name);
+#else
+	printf("        %s --contact_analysis 1 *.xml\n",program_name);
+#endif
 	printf("   * Convert ligand.xml to dlg, perform contact analysis, and output dlg to stdout:\n");
+#ifndef TOOLMODE
 	printf("        %s --xml2dlg ligand.xml --contact_analysis 1 --dlg2stdout 1\n",program_name);
+#else
+	printf("        %s -C 1 -2 1 ligand.xml\n",program_name);
+#endif
+#ifndef TOOLMODE
 	printf("   * Dock ligands and map specified in file.lst with flexres flex.pdbqt:\n");
 	printf("        %s --filelist file.lst --flexres flex.pdbqt\n",program_name);
 	printf("   * Dock ligands, map, and (optional) flexres specified in docking.dpf on device #2:\n");
 	printf("        %s --import_dpf docking.dpf --devnum 2\n\n",program_name);
-	
+#endif
 	exit(0);
 }
 
@@ -1047,7 +1220,8 @@ int get_commandpars(
 		// default values
 		mypars->abs_max_dmov        = 6.0/(*spacing);             // +/-6A
 		mypars->base_dmov_mul_sqrt3 = 2.0/(*spacing)*sqrt(3.0);   // 2 A
-		mypars->xrayligandfile      = strdup(mypars->ligandfile); // By default xray-ligand file is the same as the randomized input ligand
+		if(mypars->xrayligandfile==NULL)
+			mypars->xrayligandfile      = strdup(mypars->ligandfile); // By default xray-ligand file is the same as the randomized input ligand
 		if(mypars->xml2dlg){
 			if(strlen(mypars->load_xml)>4){ // .xml = 4 chars
 				i=strlen(mypars->load_xml)-4;
@@ -1069,10 +1243,16 @@ int get_commandpars(
 	}
 
 	// overwriting values which were defined as a command line argument
+#ifndef TOOLMODE
 	for (i=1; i<(*argc)-1; i+=2)
+#else
+	for (i=1; i<(*argc); i++)
+#endif
 	{
 		arg_recognized = 0;
-
+#ifdef TOOLMODE
+		if(argv[i][0]!='-') arg_recognized=1;
+#endif
 		// Argument: number of energy evaluations. Must be a positive integer.
 		if (argcmp("nev", argv[i], 'e'))
 		{
@@ -1433,11 +1613,13 @@ int get_commandpars(
 		// UPDATED in : get_filelist()
 		// ---------------------------------
 		// Argument: name of file containing file list
-		if (argcmp("filelist", argv [i], 'B'))
+		if (argcmp("filelist", argv [i], 'B')){
 			arg_recognized = 1;
+			i+=mypars->filelist_files-1; // skip ahead in case there are multiple entries here
+		}
 
 		// ---------------------------------
-		// UPDATED in : preparse_dpf()
+		// UPDATED in : initial_commandpars()
 		// ---------------------------------
 		// Argument: name of file containing file list
 		if (argcmp("import_dpf", argv [i], 'I'))
@@ -1500,8 +1682,12 @@ int get_commandpars(
 		{
 			arg_recognized = 1;
 			arg_set = 0;
-			if(!late_call){
+			if(!late_call){ // this means when this is called during dpf file reading
 				arg_set = 1;
+				if(strchr(argv[i+1], ',')){ // only allowed from command line
+					printf("Error: Value of --devnum (-D) is expected to be a single device number here.\n");
+					return -1;
+				}
 				sscanf(argv [i+1], "%d", &tempint);
 				if ((tempint >= 1) && (tempint <= 65536)){
 					mypars->devnum = (unsigned long) tempint-1;
@@ -1513,17 +1699,6 @@ int get_commandpars(
 		}
 		// ----------------------------------
 
-		// ----------------------------------
-		// Argument: Multiple CG-G0 maps or not
-		// - has already been tested for in
-		//   main.cpp, as it's needed at grid
-		//   creation time not after (now)
-		if (argcmp("cgmaps", argv [i]))
-		{
-			arg_recognized = 1; // stub to not complain about an unknown parameter
-		}
-		// ----------------------------------
-
 		// ----------------------------------
 		// Argument: Automatic stopping criterion (1) or not (0)
 		if (argcmp("autostop", argv [i], 'A'))
@@ -1732,6 +1907,20 @@ int get_commandpars(
 				mypars->output_xml = true;
 		}
 
+		// Argument: choose whether to calculate and output clustering
+		// If the value is 0, clustering is turned off (calc_clustering = false)
+		// Any other value turns clustering on (calc_clustering = true)
+		if (argcmp("clustering", argv [i]))
+		{
+			arg_recognized = 1;
+			sscanf(argv [i+1], "%d", &tempint);
+			
+			if (tempint == 0)
+				mypars->calc_clustering = false;
+			else
+				mypars->calc_clustering = true;
+		}
+
 		// ----------------------------------
 		// Argument: ligand xray pdbqt file name
 		if (argcmp("xraylfile", argv[i], 'R'))
@@ -1740,7 +1929,6 @@ int get_commandpars(
 			free(mypars->xrayligandfile);
 			mypars->xrayligandfile = strdup(argv[i+1]);
 			mypars->given_xrayligandfile = true;
-			printf("Info: using --xraylfile (-R) value as X-ray ligand.\n");
 		}
 		// ----------------------------------
 
@@ -1749,6 +1937,9 @@ int get_commandpars(
 			print_options(argv[0]);
 			return -1; // we won't get here - maybe we will in the future though ...
 		}
+#ifdef TOOLMODE
+		else i++;
+#endif
 	}
 
 	// validating some settings
@@ -1791,8 +1982,11 @@ std::vector<ReceptorAtom> read_receptor_atoms(
 		sscanf(line.c_str(),"%255s",tempstr);
 		if ((strcmp(tempstr, "HETATM") == 0) || (strcmp(tempstr, "ATOM") == 0))
 		{
+			line.insert(38,1,' '); // add spaces to make reading coordinates easier
+			line.insert(47,1,' ');
 			sscanf(&line.c_str()[30], "%f %f %f", &(current.x), &(current.y), &(current.z));
-			sscanf(&line.c_str()[77], "%3s", current.atom_type);
+			// moved by the two spaces above
+			sscanf(&line.c_str()[79], "%3s", current.atom_type);
 			line[27]='\0';
 			sscanf(line.c_str(), "%*s %d %4s %3s %1s %d", &(current.id), current.name, current.res_name, current.chain_id, &(current.res_id));
 			// assign H-bond acceptors (is going to fail for flexres with modified atom types)
@@ -1804,7 +1998,6 @@ std::vector<ReceptorAtom> read_receptor_atoms(
 			if((heavy=='O') || (heavy=='N') || (heavy=='S')) heavy_ids.push_back(atoms.size());
 			atoms.push_back(current);
 		}
-		if(strcmp(tempstr, "TER") == 0) break;
 	}
 	ReceptorAtom heavy, HD;
 	// assign H-donor heavy atoms
diff --git a/host/src/main.cpp b/host/src/main.cpp
index 531b3017..35a9915b 100644
--- a/host/src/main.cpp
+++ b/host/src/main.cpp
@@ -33,13 +33,21 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 #include "processgrid.h"
 #include "processligand.h"
+#include "processresult.h"
 #include "getparameters.h"
+
+#ifndef TOOLMODE
 #include "performdocking.h"
+#endif
+
 #include "filelist.hpp"
 #include "setup.hpp"
 #include "profile.hpp"
 #include "simulation_state.hpp"
+
+#ifndef TOOLMODE
 #include "GpuData.h"
+#endif
 
 #ifndef _WIN32
 // Time measurement
@@ -74,7 +82,7 @@ int main(int argc, char* argv[])
 	printf("AutoDock-GPU version: %s\n\n", VERSION);
 	// Print help screen if no parameters were specified
 	// (or if last parameter is "-help"; parameters in
-	//  between will be caught in preparse_dpf later)
+	//  between will be caught in initial_commandpars later)
 	if((argc<2) || (argcmp("help", argv[argc-1], 'h')))
 		print_options(argv[0]);
 	// Timer initializations
@@ -94,7 +102,7 @@ int main(int argc, char* argv[])
 	FileList filelist;
 	Dockpars initial_pars;
 	Gridinfo initial_grid;
-	if (preparse_dpf(&argc, argv, &initial_pars, &initial_grid, filelist) != 0)
+	if (initial_commandpars(&argc, argv, &initial_pars, &initial_grid, filelist) != 0)
 		return 1;
 	if (get_filelist(&argc, argv, &initial_pars, &initial_grid, filelist) != 0)
 		return 1;
@@ -115,71 +123,118 @@ int main(int argc, char* argv[])
 			filelist.mypars[i].dlg2stdout = false;
 	}
 #endif
-	if(initial_pars.xml2dlg){
-		if(initial_pars.contact_analysis)
-			printf("Analyzing ");
-		else
-			printf("Converting ");
-		printf("%d xml file",n_files);
-		if(n_files>1) printf("s");
-		if(initial_pars.contact_analysis)
-			printf(" (contact analysis cutoffs: R=%.1f Å, H=%.1f Å, V=%.1f Å)\n", initial_pars.R_cutoff, initial_pars.H_cutoff, initial_pars.V_cutoff);
-		else
-			printf(" to dlg\n");
-	} else{
-		printf("Running %d docking calculation",n_files);
-		if(n_files>1) printf("s");
-		if(initial_pars.contact_analysis)
-			printf(" (contact analysis cutoffs: R=%.1f Å, H=%.1f Å, V=%.1f Å)\n", initial_pars.R_cutoff, initial_pars.H_cutoff, initial_pars.V_cutoff);
-		else
-			printf("\n");
+#ifdef TOOLMODE
+	if(!initial_pars.xml2dlg){
+		printf("Error: Code has been compiled without GPU support and only supports xml2dlg mode.\n");
+		exit(-1);
 	}
-	printf("\n");
-	int pl_gridsize = preload_gridsize(filelist);
-
-	// Setup master map set (one for now, nthreads-1 for general case)
-	std::vector<Map> all_maps;
-
+	int nr_devices=0;
+#else
 	int devnum=-1;
-	int nr_devices=initial_pars.devices_requested;
 	// Get device number to run on
 	for (int i=1; i<argc-1; i+=2)
 	{
+		if (argcmp("filelist", argv[i], 'B'))
+			i+=initial_pars.filelist_files-1; // skip ahead in case there are multiple entries here
+		
 		if (argcmp("xml2dlg", argv[i], 'X'))
 			i+=initial_pars.xml_files-1; // skip ahead in case there are multiple entries here
 		
 		if (argcmp("devnum", argv [i], 'D'))
 		{
-			unsigned int tempint;
-			sscanf(argv [i+1], "%d", &tempint);
-			if ((tempint >= 1) && (tempint <= 65536)){
-				devnum = (unsigned long) tempint-1;
+			if(stricmp(argv[i+1],"all")==0){
+				initial_pars.dev_pool.clear();
+				initial_pars.dev_pool = get_gpu_pool();
+				devnum = -1;
+			} else if(stricmp(argv[i+1],"auto")==0){
+				initial_pars.dev_pool.clear();
+				devnum = -1;
 			} else{
-				printf("Error: Value of --devnum (-D) argument must be an integer between 1 and 65536.\n");
-				exit(-1);
+				initial_pars.dev_pool.clear();
+				unsigned int tempint;
+				char* val=argv[i+1];
+				bool multiple=false;
+				do{
+					sscanf(val, "%d", &tempint);
+					if ((tempint >= 1) && (tempint <= 65536)){
+						devnum = (unsigned long) tempint-1;
+					} else{
+						printf("Error: Value(s) of --devnum (-D) argument must be an integer between 1 and 65536 (examples: -D 2 or -D 1,3,5).\n");
+						exit(-1);
+					}
+					val=strchr(val,','); // find next entry
+					if(val){
+						val++; // move past the comma
+						multiple=true;
+					}
+					if(multiple) initial_pars.dev_pool.push_back(devnum);
+				} while(val);
+				if(multiple) devnum=-1; // needed to automatically load the right values from the pool
 			}
-			break;
 		}
 	}
+	int nr_devices=initial_pars.dev_pool.size();
 	if(devnum>=0){ // user-specified argument on command line has precedence
-		if(initial_pars.devices_requested>1)
-			printf("Using (single GPU) --devnum (-D) specified as command line option.\n");
+		if(initial_pars.dev_pool.size()>1)
+			printf("Using (single GPU) --devnum (-D) specified as command line option.\n\n");
 		nr_devices=1;
+		initial_pars.dev_pool.clear();
 	} else devnum=initial_pars.devnum;
 
-	if(nr_devices<1) nr_devices=1;
+	if(nr_devices<1){
+		nr_devices=1;
+		initial_pars.dev_pool.clear();
+	}
 #ifndef USE_PIPELINE
-	if(nr_devices>1) printf("Info: Parallelization over multiple GPUs is only available if OVERLAP=ON is specified when AD-GPU is build.\n");
+	if(nr_devices>1) printf("Info: Parallelization over multiple GPUs is only available if OVERLAP=ON is specified when AD-GPU is build.\n\n");
+#endif
 #endif
+	if(initial_pars.xml2dlg){
+		if(initial_pars.contact_analysis)
+			printf("Analyzing ");
+		else
+			printf("Converting ");
+		printf("%d xml file",n_files);
+		if(n_files>1) printf("s");
+		if(initial_pars.contact_analysis)
+			printf(" (contact analysis cutoffs: R=%.1f Å, H=%.1f Å, V=%.1f Å)\n", initial_pars.R_cutoff, initial_pars.H_cutoff, initial_pars.V_cutoff);
+		else
+			printf(" to dlg\n");
+	} else{
+		printf("Running %d docking calculation",n_files);
+		if(n_files>1){
+			printf("s");
+			if(nr_devices>1) printf(" on %d devices",std::min(n_files,nr_devices));
+		}
+		if(initial_pars.contact_analysis)
+			printf(" (contact analysis cutoffs: R=%.1f Å, H=%.1f Å, V=%.1f Å)\n", initial_pars.R_cutoff, initial_pars.H_cutoff, initial_pars.V_cutoff);
+		else
+			printf("\n");
+	}
+	printf("\n");
+	int max_preallocated_gridsize = preallocated_gridsize(filelist);
+
+#ifndef TOOLMODE
 	// Objects that are arguments of docking_with_gpu
 	GpuData cData[nr_devices];
 	GpuTempData tData[nr_devices];
-	
+#ifdef USE_PIPELINE
+	omp_lock_t gpu_locks[nr_devices];
+#endif
 	for(int i=0; i<nr_devices; i++){
 		filelist.load_maps_gpu.push_back(true);
+		if(initial_pars.dev_pool.size()>0)
+			cData[i].devnum=initial_pars.dev_pool[i];
+		else
+			cData[i].devnum=devnum;
+		cData[i].preallocated_gridsize = max_preallocated_gridsize;
 		tData[i].pMem_fgrids=NULL; // in case setup fails this is needed to make sure we don't segfault trying to deallocate it
+		tData[i].device_busy=false;
+#ifdef USE_PIPELINE
+		omp_init_lock(&gpu_locks[i]);
+#endif
 	}
-	
+#endif
 	// Set up run profiles for timing
 	bool get_profiles = true; // hard-coded switch to use ALS's job profiler
 	Profiler profiler;
@@ -191,14 +246,10 @@ int main(int argc, char* argv[])
 	// Error flag for each ligand
 	std::vector<int> err(n_files,0);
 
-	if(!initial_pars.xml2dlg){
-		if(nr_devices==1){
-			cData[0].devnum = devnum;
-			cData[0].preload_gridsize = pl_gridsize;
-			setup_gpu_for_docking(cData[0],tData[0]);
-		}
-	}
-
+#ifndef TOOLMODE
+	if(!initial_pars.xml2dlg && (nr_devices==1))
+		setup_gpu_for_docking(cData[0],tData[0]);
+#endif
 	total_setup_time+=seconds_since(time_start);
 	
 	if(initial_pars.xml2dlg && !initial_pars.dlg2stdout){
@@ -211,15 +262,15 @@ int main(int argc, char* argv[])
 #ifdef USE_PIPELINE
 	#pragma omp parallel
 	{
+		char outbuf[256];
 		int t_id = omp_get_thread_num();
 #else
 	{
 #endif
 		Dockpars   mypars = initial_pars;
 		Liganddata myligand_init;
-		Gridinfo   mygrid = initial_grid;
+		Gridinfo*  mygrid = &initial_grid;
 		Liganddata myxrayligand;
-		std::vector<float> floatgrids;
 		SimulationState sim_state;
 		int dev_nr = 0;
 #ifndef _WIN32
@@ -234,7 +285,7 @@ int main(int argc, char* argv[])
 			// Setup the next file in the queue
 			if(filelist.used){
 				mypars = filelist.mypars[i_job];
-				mygrid = filelist.mygrids[i_job];
+				mygrid = &filelist.mygrids[mypars.filelist_grid_idx];
 			}
 			if(mypars.contact_analysis){
 				if(filelist.preload_maps){ // use preloaded data for receptor
@@ -249,60 +300,60 @@ int main(int argc, char* argv[])
 					if((50*(i_job+1)) % n_files < 50){
 						printf("*"); fflush(stdout);
 					}
-			} else{
+			}
+#ifndef TOOLMODE
+			else
+			{
 #ifdef USE_PIPELINE
 				printf ("(Thread %d is setting up Job #%d)\n",t_id,i_job+1); fflush(stdout);
 				#pragma omp critical
 #endif
 				{
 					if(nr_devices>1){
-						dev_nr = mypars.devices_requested-1;
-						if(cData[dev_nr].devnum>-2){
-							cData[dev_nr].devnum = mypars.devnum;
-							cData[dev_nr].preload_gridsize = pl_gridsize;
-							setup_gpu_for_docking(cData[dev_nr],tData[dev_nr]);
-						}
+						if(mypars.dev_pool_nr<0){ // assign next available GPU
+							dev_nr=-1;
+							for(unsigned int i=0; i<nr_devices; i++){
+								if(!tData[i].device_busy){ // found an available GPU
+									dev_nr=i;
+									break;
+								}
+							}
+							// if no GPU is available, assign one based on the job nr
+							if(dev_nr<0) dev_nr = i_job % nr_devices;
+						} else dev_nr = mypars.dev_pool_nr; // this is set when specific GPU is requested
+						tData[dev_nr].device_busy = true;
+						setup_gpu_for_docking(cData[dev_nr],tData[dev_nr]);
+						fflush(stdout);
 					}
 				}
 			}
+#endif
 			start_timer(setup_timer);
 			// Load files, read inputs, prepare arrays for docking stage
-			if (setup(all_maps, mygrid, floatgrids, mypars, myligand_init, myxrayligand, filelist, i_job, argc, argv) != 0) {
+			if (setup(mygrid, &mypars, myligand_init, myxrayligand, filelist, i_job, argc, argv) != 0) {
 				// If error encountered: Set error flag to 1; Add to count of finished jobs
 				// Keep in setup stage rather than moving to launch stage so a different job will be set up
-				printf("\n\nError in setup of Job #%d", i_job+1);
-				if (filelist.used){
-					printf(":\n");
-					printf("(   Grid map file: %s )\n",  mypars.fldfile);
-					printf("(   Ligand file: %s )\n", mypars.ligandfile); fflush(stdout);
-					if(mypars.flexresfile)
-						printf("(   Flexible residue: %s )\n", mypars.flexresfile);
-					fflush(stdout);
-				} else printf("\n");
+#ifdef USE_PIPELINE
+				#pragma omp critical
+#endif
+				{
+					printf("\nError in setup of Job #%d", i_job+1);
+					if (filelist.used){
+						printf(":\n");
+						printf("(   Grid map file: %s )\n",  mypars.fldfile);
+						printf("(   Ligand file: %s )\n", mypars.ligandfile); fflush(stdout);
+						if(mypars.flexresfile)
+							printf("(   Flexible residue: %s )\n", mypars.flexresfile);
+						fflush(stdout);
+					} else printf("\n");
+				}
 				err[i_job] = 1;
 				continue;
 			} else { // Successful setup
 #ifdef USE_PIPELINE
 				#pragma omp atomic update
 #endif
-				total_setup_time+=seconds_since(setup_timer); // can't count waiting to enter the critical section -AT
-				// Copy preloaded maps to GPU
-				if(!mypars.xml2dlg){
-#ifdef USE_PIPELINE
-					#pragma omp critical
-#endif
-					{
-						start_timer(setup_timer);
-						if(filelist.preload_maps && filelist.load_maps_gpu[dev_nr]){
-							int size_of_one_map = 4*mygrid.size_xyz[0]*mygrid.size_xyz[1]*mygrid.size_xyz[2];
-							for (unsigned int t=0; t < all_maps.size(); t++){
-								copy_map_to_gpu(tData[dev_nr],all_maps,t,size_of_one_map);
-							}
-							filelist.load_maps_gpu[dev_nr]=false;
-						}
-						total_setup_time+=seconds_since(setup_timer);
-					}
-				}
+				total_setup_time+=seconds_since(setup_timer);
 			}
 			
 			// Starting Docking or loading results
@@ -311,7 +362,7 @@ int main(int argc, char* argv[])
 				// allocating CPU memory for initial populations
 				mypars.output_xml = false;
 				int nrot;
-				sim_state.cpu_populations = read_xml_genomes(mypars.load_xml, mygrid.spacing, nrot, true);
+				sim_state.cpu_populations = read_xml_genomes(mypars.load_xml, mygrid->spacing, nrot, true);
 				if(nrot!=myligand_init.num_of_rotbonds){
 					printf("\nError: XML genome contains %d rotatable bonds but current ligand has %d.\n",nrot,myligand_init.num_of_rotbonds);
 					exit(2);
@@ -321,9 +372,9 @@ int main(int argc, char* argv[])
 				get_movvec_to_origo(&(sim_state.myligand_reference), movvec_to_origo);
 				double flex_vec[3];
 				for (unsigned int i=0; i<3; i++)
-					flex_vec [i] = -mygrid.origo_real_xyz [i];
+					flex_vec [i] = -mygrid->origo_real_xyz [i];
 				move_ligand(&(sim_state.myligand_reference), movvec_to_origo, flex_vec);
-				scale_ligand(&(sim_state.myligand_reference), 1.0/mygrid.spacing);
+				scale_ligand(&(sim_state.myligand_reference), 1.0/mygrid->spacing);
 				get_moving_and_unit_vectors(&(sim_state.myligand_reference));
 				mypars.pop_size = 1;
 				mypars.num_of_runs = sim_state.cpu_populations.size()/GENOTYPE_LENGTH_IN_GLOBMEM;
@@ -335,36 +386,45 @@ int main(int argc, char* argv[])
 				sim_state.cpu_evals_of_runs.resize(size_evals_of_runs);
 				memset(sim_state.cpu_evals_of_runs.data(), 0, size_evals_of_runs);
 				total_setup_time+=seconds_since(setup_timer);
-			} else{
+				sim_state.idle_time = 0.0;
+				sim_state.exec_time = 0.0;
+
+			}
+#ifndef TOOLMODE
+			else
+			{
 				int error_in_docking;
-				// Critical section to only let one thread access GPU at a time
+				// Lock to only let one thread access a given GPU at a time
+				std::string* output = NULL;
 #ifdef USE_PIPELINE
-				#pragma omp critical
+				omp_set_lock(&gpu_locks[dev_nr]);
+				if(nr_devices>1) output = new std::string;
+#endif
+				para_printf("\nRunning Job #%d", i_job+1);
+				if (filelist.used){
+					para_printf(":\n");
+					para_printf("    Device: %s\n", tData[dev_nr].device_name);
+					para_printf("    Grid map file: %s\n",  mypars.fldfile);
+					para_printf("    Ligand file: %s\n", mypars.ligandfile); fflush(stdout);
+					if(mypars.flexresfile)
+						para_printf("    Flexible residue: %s\n", mypars.flexresfile);
+					fflush(stdout);
+				} else para_printf("\n");
+				// End idling timer, start exec timer
+				sim_state.idle_time = seconds_since(idle_timer);
+				start_timer(exec_timer);
+				// Dock
+				error_in_docking = docking_with_gpu(mygrid, &(mypars), &(myligand_init), &(myxrayligand), profiler.p[(get_profiles ? i_job : 0)], &argc, argv, sim_state, cData[dev_nr], tData[dev_nr], output);
+				// End exec timer, start idling timer
+				sim_state.exec_time = seconds_since(exec_timer);
+				start_timer(idle_timer);
+#ifdef USE_PIPELINE
+				omp_unset_lock(&gpu_locks[dev_nr]);
 #endif
-				{
-					printf("\nRunning Job #%d", i_job+1);
-					if (filelist.used){
-						printf(":\n");
-						printf("    Grid map file: %s\n",  mypars.fldfile);
-						printf("    Ligand file: %s\n", mypars.ligandfile); fflush(stdout);
-						if(mypars.flexresfile)
-							printf("    Flexible residue: %s\n", mypars.flexresfile);
-						fflush(stdout);
-					} else printf("\n");
-					// End idling timer, start exec timer
-					sim_state.idle_time = seconds_since(idle_timer);
-					start_timer(exec_timer);
-					// Dock
-					error_in_docking = docking_with_gpu(&(mygrid), floatgrids.data(), &(mypars), &(myligand_init), &(myxrayligand), profiler.p[(get_profiles ? i_job : 0)], &argc, argv, sim_state, cData[dev_nr], tData[dev_nr], filelist.preload_maps);
-					// End exec timer, start idling timer
-					sim_state.exec_time = seconds_since(exec_timer);
-					start_timer(idle_timer);
-				}
-
 				if (error_in_docking!=0){
 					// If error encountered: Set error flag to 1; Add to count of finished jobs
 					// Set back to setup stage rather than moving to processing stage so a different job will be set up
-					printf("\n\nError in docking_with_gpu, stopped Job #%d.\n",i_job+1);
+					para_printf("\nError in docking_with_gpu, stopped Job #%d.\n",i_job+1);
 					err[i_job] = 1;
 					continue;
 				} else { // Successful run
@@ -373,23 +433,36 @@ int main(int argc, char* argv[])
 					#pragma omp atomic update
 #endif
 					total_exec_time+=sim_state.exec_time;
-					printf("\nJob #%d took %.3f sec after waiting %.3f sec for setup\n\n", i_job+1, sim_state.exec_time, sim_state.idle_time);
+					para_printf("\nJob #%d took %.3f sec after waiting %.3f sec for setup\n\n", i_job+1, sim_state.exec_time, sim_state.idle_time);
 					if (get_profiles && filelist.used){
 						// Detailed timing information to .timing
 						profiler.p[i_job].exec_time = sim_state.exec_time;
 					}
 #endif
 				}
+#ifdef USE_PIPELINE
+				if(nr_devices>1){
+					#pragma omp critical
+					{
+						printf("%s", output->c_str());
+						fflush(stdout);
+					}
+					delete output;
+				}
+#endif
 			}
-
+#endif
 			// Post-processing
 #ifdef USE_PIPELINE
 			if(!mypars.xml2dlg){
+#ifndef TOOLMODE
+				if(nr_devices>1) tData[dev_nr].device_busy = false;
+#endif
 				printf ("(Thread %d is processing Job #%d)\n",t_id,i_job+1); fflush(stdout);
 			}
 #endif
 			start_timer(processing_timer);
-			process_result(&(mygrid), floatgrids.data(), &(mypars), &(myligand_init), &(myxrayligand), &argc,argv, sim_state);
+			process_result(mygrid, &(mypars), &(myligand_init), &(myxrayligand), &argc,argv, sim_state);
 #ifdef USE_PIPELINE
 			#pragma omp atomic update
 #endif
@@ -402,9 +475,6 @@ int main(int argc, char* argv[])
 				if(mypars.flexresfile) free(mypars.flexresfile);
 				if(mypars.xrayligandfile) free(mypars.xrayligandfile);
 				if(mypars.resname) free(mypars.resname);
-				if(mygrid.grid_file_path) free(mygrid.grid_file_path);
-				if(mygrid.receptor_name) free(mygrid.receptor_name);
-				if(mygrid.map_base_name) free(mygrid.map_base_name);
 			}
 		} // end of for loop
 		if(!filelist.used){
@@ -415,39 +485,41 @@ int main(int argc, char* argv[])
 			if(mypars.flexresfile) free(mypars.flexresfile);
 			if(mypars.xrayligandfile) free(mypars.xrayligandfile);
 			if(mypars.resname) free(mypars.resname);
-			if(mygrid.grid_file_path) free(mygrid.grid_file_path);
-			if(mygrid.receptor_name) free(mygrid.receptor_name);
-			if(mygrid.map_base_name) free(mygrid.map_base_name);
 		}
 	} // end of parallel section
 	if(initial_pars.xml2dlg && !initial_pars.dlg2stdout && (n_files>100)) printf("\n\n"); // finish progress bar
 	
 #ifndef _WIN32
 	// Total time measurement
-	printf("Run time of entire job set (%d file%s): %.3f sec", n_files, n_files>1?"s":"", seconds_since(time_start));
+	printf("Run time of entire job set (%d file%s): %.3f sec\n", n_files, n_files>1?"s":"", seconds_since(time_start));
 #ifdef USE_PIPELINE
 	if(n_files>1){
-		printf("\nSavings from multithreading: %.3f sec",(total_setup_time+total_processing_time+total_exec_time) - seconds_since(time_start));
-		//if (filelist.preload_maps) printf("\nSavings from receptor reuse: %.3f sec * avg_maps_used/n_maps",receptor_reuse_time*n_files);
-		printf("\nIdle time of execution thread: %.3f sec",seconds_since(time_start) - total_exec_time);
+		printf("Savings from multithreading: %.3f sec\n",(total_setup_time+total_processing_time+total_exec_time) - seconds_since(time_start));
+		if(!initial_pars.xml2dlg) // in xml2dlg mode, there's only "idle time" (aka overlapped processing)
+			printf("Idle time of execution thread: %.3f sec\n",seconds_since(time_start) - total_exec_time);
 		if (get_profiles && filelist.used && !initial_pars.xml2dlg) // output profile with filelist name or dpf file name (depending on what is available)
 			profiler.write_profiles_to_file((filelist.filename!=NULL) ? filelist.filename : initial_pars.dpffile);
-	} else printf("\nProcessing time: %.3f sec",total_processing_time);
+	} else printf("Processing time: %.3f sec\n",total_processing_time);
 #else
-	printf("\nProcessing time: %.3f sec",total_processing_time);
+	printf("Processing time: %.3f sec\n",total_processing_time);
+#endif
 #endif
+#ifndef TOOLMODE
+	for(int i=0; i<nr_devices; i++){
+#ifdef USE_PIPELINE
+		omp_destroy_lock(&gpu_locks[i]);
 #endif
-	if(!initial_pars.xml2dlg)
-		for(int i=0; i<nr_devices; i++)
+		if(!initial_pars.xml2dlg)
 			finish_gpu_from_docking(cData[i],tData[i]);
-
+	}
+#endif
 	// Alert user to ligands that failed to complete
 	int n_errors=0;
 	for (int i=0; i<n_files; i++){
 		if (err[i]==1){
 			if (filelist.used){
-				if (n_errors==0) printf("\nWARNING: The following jobs were not successful:");
-				printf("\nJob %d: %s\n", i, filelist.ligand_files[i].c_str());
+				if (n_errors==0) printf("\nWarning: The following jobs were not successful:\n");
+				printf("         Job %d: %s\n", i, filelist.ligand_files[i].c_str());
 			} else {
 				printf("\nThe job was not successful.\n");
 			}
diff --git a/host/src/miscellaneous.cpp b/host/src/miscellaneous.cpp
index 47f9aaf3..7d5acee3 100644
--- a/host/src/miscellaneous.cpp
+++ b/host/src/miscellaneous.cpp
@@ -25,6 +25,51 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 #include "miscellaneous.h"
 
+float map2float(const char* c)
+// This function converts what we typically find in an autogrid map file into a
+// floating point number - just a bit quicker than the usual sscanf()
+// -> due to using 32-bit integers this function is limited to 9 digits for both
+//    the whole number and the fractional part - a safety check is performed with
+//    sscanf() used as the fallback (also when there is no decimal point at all)
+{
+	float result = 0.0f;                         // defined return value even if sscanf() fails
+	bool negative = false;                       // example: -123.456
+	if(*c == '-'){                               // *c = '-'
+		negative = true;                     // => negative = true
+		c++;
+	}
+	// safety check
+	int len = strlen(c);
+	if(len>9){ // no potential issues at or below 9 digits in total
+		const char* dp = strchr(c,'.');
+		// without a decimal point the whole string counts as whole-number digits
+		int d = dp ? (int)(dp-c) : len;
+		if((d>9) || (len-d>9)){ // fall back to sscanf() if numbers are going to be too big for integers
+			sscanf(c, "%f", &result);
+			if(negative) return -result;
+			return result;
+		}
+	}
+	// fast path: whole and fractional part are each accumulated in a 32-bit integer
+	int number = 0;                              // 1. *c = '1': number = 0*10  + 1 = 1
+	while((*c >= '0') && (*c <= '9')){           // 2. *c = '2': number = 1*10  + 2 = 12
+		number = number * 10 + (*c - '0');   // 3. *c = '3': number = 12*10 + 3 = 123
+		c++;                                 // 4. *c = '.'
+	}
+	if(*c == '.') c++; // jump over decimal point
+	int decimal = 0;
+	int denom = 1;
+	while((*c >= '0') && (*c <= '9')){           // 1. *c = '4': decimal = 0*10  + 4 = 4,   denom = 10
+		decimal = decimal * 10 + (*c - '0'); // 2. *c = '5': decimal = 4*10  + 5 = 45,  denom = 100
+		denom *= 10;                         // 3. *c = '6': decimal = 45*10 + 6 = 456, denom = 1000
+		c++;
+	}
+	// use more expensive division only once
+	result = (float)number + (float)decimal/((float)denom);
+	if(negative) return -result;
+	return result;
+}
+
 int float2fracint(double toconv, int frac)
 // The function converts a float value to a fixed pont fractional number in (32-frac).frac format,
 // and returns it as an integer.
@@ -57,6 +102,18 @@ double distance(const double point1 [], const double point2 [])
 	return sqrt(sub1*sub1 + sub2*sub2 + sub3*sub3);
 }
 
+double distance2(const double point1 [], const double point2 [])
+// Computes the squared Euclidean distance between point1 and point2
+// (no square root is taken). Each array has to hold the x, y and z
+// coordinate of its point, in that order.
+{
+	// per-axis coordinate differences
+	const double dx = point1 [0] - point2 [0];
+	const double dy = point1 [1] - point2 [1];
+	const double dz = point1 [2] - point2 [2];
+	return dx*dx + dy*dy + dz*dz;
+}
+
 void vec_point2line(const double point [], const double line_pointA [], const double line_pointB [], double vec [])
 // The function calculates the vector which moves a point given by the first parameter to its perpendicular projection
 // on a line given by to of its points (line_pointA and line_pointB parameters). The result vector is the vec parameter.
@@ -200,6 +257,21 @@ void rotate(double point [], const double movvec [], const double normvec [], co
 		        point [0], point [1], point [2]);
 }
 
+std::string get_filepath(const char* filename)
+// Returns the directory part of filename (dirname() result, or drive + directory on Windows).
+{
+	#ifndef _WIN32
+	char* ts1 = strdup(filename); // dirname() is handed a private copy instead of the caller's string
+	std::string result = dirname(ts1); // copy the result before the buffer is released
+	free(ts1);
+	return result;
+	#else
+	char drive_tmp[_MAX_DRIVE], path_tmp[_MAX_DIR];
+	_splitpath(filename, drive_tmp, path_tmp, NULL, NULL);
+	return std::string(drive_tmp) + path_tmp; // two char arrays cannot be concatenated with '+' directly
+	#endif
+}
+
 #if 0
 // -------------------------------------------------------------------
 // Replacing rotation genes: from spherical space to Shoemake space
@@ -362,13 +434,13 @@ double angle_of_vectors(const double vector1 [], const double vector2 [])
 
 	scalmul = 0;
 
-	len_vec1 = distance(vector1, zerovec);
-	len_vec2 = distance(vector2, zerovec);
+	len_vec1 = distance2(vector1, zerovec);
+	len_vec2 = distance2(vector2, zerovec);
 
 	for (i=0; i<3; i++)
 		scalmul += vector1 [i]*vector2 [i];
 
-	temp = scalmul/(len_vec1*len_vec2);
+	temp = scalmul/sqrt(len_vec1*len_vec2);
 
 	if (temp > 1)  temp =  1;
 	if (temp < -1) temp = -1;
@@ -385,34 +457,6 @@ void vec_crossprod(const double vector1 [], const double vector2 [], double cros
 	crossprodvec [2] = vector1 [0]*vector2 [1] - vector1 [1]*vector2 [0];
 }
 
-void get_trilininterpol_weights(double weights [][2][2], const double* dx, const double* dy, const double* dz)
-// The function calculates the weights for trilinear interpolation based on the location of the point inside
-// the cube which is given by the second, third and fourth parameters.
-{
-	weights [0][0][0] = (1-(*dx))*(1-(*dy))*(1-(*dz));
-	weights [1][0][0] = (*dx)*(1-(*dy))*(1-(*dz));
-	weights [0][1][0] = (1-(*dx))*(*dy)*(1-(*dz));
-	weights [1][1][0] = (*dx)*(*dy)*(1-(*dz));
-	weights [0][0][1] = (1-(*dx))*(1-(*dy))*(*dz);
-	weights [1][0][1] = (*dx)*(1-(*dy))*(*dz);
-	weights [0][1][1] = (1-(*dx))*(*dy)*(*dz);
-	weights [1][1][1] = (*dx)*(*dy)*(*dz);
-}
-
-void get_trilininterpol_weights_f(float weights [][2][2], const float* dx, const float* dy, const float* dz)
-// The function calculates the weights for trilinear interpolation based on the location of the point inside
-// the cube which is given by the second, third and fourth parameters.
-{
-	weights [0][0][0] = (1-(*dx))*(1-(*dy))*(1-(*dz));
-	weights [1][0][0] = (*dx)*(1-(*dy))*(1-(*dz));
-	weights [0][1][0] = (1-(*dx))*(*dy)*(1-(*dz));
-	weights [1][1][0] = (*dx)*(*dy)*(1-(*dz));
-	weights [0][0][1] = (1-(*dx))*(1-(*dy))*(*dz);
-	weights [1][0][1] = (*dx)*(1-(*dy))*(*dz);
-	weights [0][1][1] = (1-(*dx))*(*dy)*(*dz);
-	weights [1][1][1] = (*dx)*(*dy)*(*dz);
-}
-
 void print_binary_string(unsigned long long to_print)
 // The function prints out the value of to_print parameter to the standart io as a binary number.
 {
diff --git a/host/src/performdocking.cpp.Cuda b/host/src/performdocking.cpp.Cuda
index 8e7a5a07..54300ab8 100644
--- a/host/src/performdocking.cpp.Cuda
+++ b/host/src/performdocking.cpp.Cuda
@@ -93,20 +93,25 @@ double elapsed_seconds(
 	return std::chrono::duration_cast<FloatingPointSeconds>(end - start).count();
 }
 
-void copy_map_to_gpu(
-                     GpuTempData&      tData,
-                     std::vector<Map>& all_maps,
-                     int               t,
-                     int               size_of_one_map
-                    )
+std::vector<int> get_gpu_pool()
+// Returns the ids of all CUDA devices reporting compute capability major >= 3; exits the program if none is found.
 {
-	cudaError_t status = cudaMemcpy(tData.pMem_fgrids+t*size_of_one_map,all_maps[t].grid.data(),sizeof(float)*size_of_one_map, cudaMemcpyHostToDevice);
-	if (status != cudaSuccess) {
-		printf("%s %s\n", "pMem_fgrids: failed to upload maps to GPU memory.\n", cudaGetErrorString(status));
-		assert(0);
+	int gpuCount=0;
+	cudaError_t status = cudaGetDeviceCount(&gpuCount);
+	RTERROR(status, "cudaGetDeviceCount failed");
+	std::vector<int> result;
+	cudaDeviceProp props;
+	for(int i=0; i<gpuCount; i++){ // int index matches gpuCount and the element type of result
+		RTERROR(cudaGetDeviceProperties(&props,i),"cudaGetDeviceProperties failed");
+		if(props.major>=3) result.push_back(i);
+	}
+	if (result.size() == 0)
+	{
+		printf("No CUDA devices with compute capability >= 3.0 found, exiting.\n");
 		cudaDeviceReset();
 		exit(-1);
 	}
+	return result;
 }
 
 void setup_gpu_for_docking(
@@ -114,12 +119,15 @@ void setup_gpu_for_docking(
                            GpuTempData& tData
                           )
 {
-	if(cData.devnum<-1) return; // device already setup
+	cudaError_t status;
+	if(cData.devnum<-1){
+		status = cudaSetDevice(cData.devid);
+		return; // device already setup
+	}
 	auto const t0 = std::chrono::steady_clock::now();
 
 	// Initialize CUDA
 	int gpuCount=0;
-	cudaError_t status;
 	status = cudaGetDeviceCount(&gpuCount);
 	RTERROR(status, "cudaGetDeviceCount failed");
 	if (gpuCount == 0)
@@ -141,11 +149,13 @@ void setup_gpu_for_docking(
 	cudaDeviceProp props;
 	RTERROR(cudaGetDevice(&(cData.devnum)),"cudaGetDevice failed");
 	RTERROR(cudaGetDeviceProperties(&props,cData.devnum),"cudaGetDeviceProperties failed");
-	printf("Cuda device:                              %s",props.name);
-	if(gpuCount>1) printf(" (#%d / %d)",cData.devnum+1,gpuCount);
-	printf("\n");
+	tData.device_name = (char*) malloc(strlen(props.name)+32); // make sure array is large enough to hold device number text too
+	strcpy(tData.device_name, props.name);
+	if(gpuCount>1) sprintf(&tData.device_name[strlen(props.name)], " (#%d / %d)",cData.devnum+1,gpuCount);
+	printf("Cuda device:                              %s\n",tData.device_name);
 	RTERROR(cudaMemGetInfo(&freemem,&totalmem), "cudaGetMemInfo failed");
-	printf("Available memory on device:               %llu MB (total: %llu MB)\n",(freemem>>20),(totalmem>>20));
+	printf("Available memory on device:               %lu MB (total: %lu MB)\n",(freemem>>20),(totalmem>>20));
+	cData.devid=cData.devnum;
 	cData.devnum=-2;
 #ifdef SET_CUDA_PRINTF_BUFFER
 	status = cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 200000000ull);
@@ -153,37 +163,17 @@ void setup_gpu_for_docking(
 #endif
 	auto const t1 = std::chrono::steady_clock::now();
 	printf("\nCUDA Setup time %fs\n", elapsed_seconds(t0 ,t1));
-	size_t sz_interintra_const      = MAX_NUM_OF_ATOMS*sizeof(float) +
-	                                  MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                                  MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                                  MAX_NUM_OF_ATOMS*sizeof(char);
-
-	size_t sz_intracontrib_const    = 2*MAX_INTRAE_CONTRIBUTORS*sizeof(uint32_t);
-
-	size_t sz_intra_const           = MAX_NUM_OF_ATYPES*sizeof(unsigned int) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(unsigned short int) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*sizeof(float);
-
-	size_t sz_rotlist_const         = MAX_NUM_OF_ROTATIONS*sizeof(int);
-
-	size_t sz_conform_const         = 3*MAX_NUM_OF_ATOMS*sizeof(float) +
-	                                  3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                                  3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                                  4*MAX_NUM_OF_RUNS*sizeof(float);
+
 	// Allocate kernel constant GPU memory
-	status = cudaMalloc((void**)&cData.pKerconst_interintra, sz_interintra_const);
+	status = cudaMalloc((void**)&cData.pKerconst_interintra, sizeof(kernelconstant_interintra));
 	RTERROR(status, "cData.pKerconst_interintra: failed to allocate GPU memory.\n");
-	status = cudaMalloc((void**)&cData.pKerconst_intracontrib, sz_intracontrib_const);
+	status = cudaMalloc((void**)&cData.pKerconst_intracontrib, sizeof(kernelconstant_intracontrib));
 	RTERROR(status, "cData.pKerconst_intracontrib: failed to allocate GPU memory.\n");
-	status = cudaMalloc((void**)&cData.pKerconst_intra, sz_intra_const);
+	status = cudaMalloc((void**)&cData.pKerconst_intra, sizeof(kernelconstant_intra));
 	RTERROR(status, "cData.pKerconst_intra: failed to allocate GPU memory.\n");
-	status = cudaMalloc((void**)&cData.pKerconst_rotlist, sz_rotlist_const);
+	status = cudaMalloc((void**)&cData.pKerconst_rotlist, sizeof(kernelconstant_rotlist));
 	RTERROR(status, "cData.pKerconst_rotlist: failed to allocate GPU memory.\n");
-	status = cudaMalloc((void**)&cData.pKerconst_conform, sz_conform_const);
+	status = cudaMalloc((void**)&cData.pKerconst_conform, sizeof(kernelconstant_conform));
 	RTERROR(status, "cData.pKerconst_conform: failed to allocate GPU memory.\n");
 
 	// Allocate mem data
@@ -209,8 +199,8 @@ void setup_gpu_for_docking(
 	RTERROR(status, "cData.pMem_dependence_on_rotangle_const: failed to upload to GPU memory.\n");
 
 	// Allocate temporary data - JL TODO - Are these sizes correct?
-	if(cData.preload_gridsize>0){
-		status = cudaMalloc((void**)&(tData.pMem_fgrids), cData.preload_gridsize * (sizeof(float)) * (ATYPE_NUM+2));
+	if(cData.preallocated_gridsize>0){
+		status = cudaMalloc((void**)&(tData.pMem_fgrids), cData.preallocated_gridsize*sizeof(float));
 		RTERROR(status, "pMem_fgrids: failed to allocate GPU memory.\n");
 	}
 	size_t size_populations = MAX_NUM_OF_RUNS * MAX_POPSIZE * GENOTYPE_LENGTH_IN_GLOBMEM*sizeof(float);
@@ -226,7 +216,11 @@ void setup_gpu_for_docking(
 	status = cudaMalloc((void**)&(tData.pMem_evals_of_new_entities), MAX_POPSIZE*MAX_NUM_OF_RUNS*sizeof(int));
 	RTERROR(status, "pMem_evals_of_new_Entities: failed to allocate GPU memory.\n");
 	size_t size_evals_of_runs = MAX_NUM_OF_RUNS*sizeof(int);
+#if defined (MAPPED_COPY)
 	status = cudaMallocManaged((void**)&(tData.pMem_gpu_evals_of_runs), size_evals_of_runs, cudaMemAttachGlobal);
+#else
+	status = cudaMalloc((void**)&(tData.pMem_gpu_evals_of_runs), size_evals_of_runs);
+#endif
 	RTERROR(status, "pMem_gpu_evals_of_runs: failed to allocate GPU memory.\n");
 	size_t blocksPerGridForEachEntity = MAX_POPSIZE * MAX_NUM_OF_RUNS;
 	size_t size_prng_seeds = blocksPerGridForEachEntity * NUM_OF_THREADS_PER_BLOCK * sizeof(unsigned int);
@@ -238,6 +232,8 @@ void finish_gpu_from_docking(
                              GpuTempData& tData
                             )
 {
+	if(cData.devnum>-2) return; // device not set up
+	
 	cudaError_t status;
 	// Release all CUDA objects
 	// Constant objects
@@ -283,11 +279,11 @@ void finish_gpu_from_docking(
 	RTERROR(status, "cudaFree: error freeing pMem_gpu_evals_of_runs");
 	status = cudaFree(tData.pMem_prng_states);
 	RTERROR(status, "cudaFree: error freeing pMem_prng_states");
+	free(tData.device_name);
 }
 
 int docking_with_gpu(
                      const Gridinfo*        mygrid,
-                           float*           cpu_floatgrids,
                            Dockpars*        mypars,
                      const Liganddata*      myligand_init,
                      const Liganddata*      myxrayligand,
@@ -297,21 +293,18 @@ int docking_with_gpu(
                            SimulationState& sim_state,
                            GpuData&         cData,
                            GpuTempData&     tData,
-                           bool             floatgrids_preloaded
+                           std::string*     output
                     )
 /* The function performs the docking algorithm and generates the corresponding result files.
 parameter mygrid:
 		describes the grid
 		filled with get_gridinfo()
-parameter cpu_floatgrids:
-		points to the memory region containing the grids
-		filled with get_gridvalues_f()
 parameter mypars:
 		describes the docking parameters
 		filled with get_commandpars()
 parameter myligand_init:
 		describes the ligands
-		filled with get_liganddata()
+		filled with parse_liganddata()
 parameter myxrayligand:
 		describes the xray ligand
 		filled with get_xrayliganddata()
@@ -319,17 +312,18 @@ parameters argc and argv:
 		are the corresponding command line arguments parameter
 */
 {
+	char* outbuf = NULL; // stays NULL unless an output string was requested (allocated on the next line)
+	if(output!=NULL) outbuf = (char*)malloc(256*sizeof(char));
+
 	auto const t1 = std::chrono::steady_clock::now();
 	cudaError_t status;
 
-
 	Liganddata myligand_reference;
 
 	float* cpu_init_populations;
 	float* cpu_final_populations;
 	unsigned int* cpu_prng_seeds;
 
-	size_t size_floatgrids;
 	size_t size_populations;
 	size_t size_energies;
 	size_t size_prng_seeds;
@@ -378,7 +372,7 @@ parameters argc and argv:
 	cpu_prng_seeds = (unsigned int*) malloc(size_prng_seeds);
 
 	LocalRNG r(mypars->seed);
-//	printf("RNG seed is %u\n", mypars->seed);
+//	para_printf("RNG seed is %u\n", mypars->seed);
 
 	for (i=0; i<blocksPerGridForEachEntity*threadsPerBlock; i++)
 		cpu_prng_seeds[i] = r.random_uint();
@@ -388,73 +382,48 @@ parameters argc and argv:
 	sim_state.cpu_evals_of_runs.resize(size_evals_of_runs);
 	memset(sim_state.cpu_evals_of_runs.data(), 0, size_evals_of_runs);
 
-	//preparing the constant data fields for the GPU
-	kernelconstant_interintra   KerConst_interintra;
-	kernelconstant_intracontrib KerConst_intracontrib;
-	kernelconstant_intra        KerConst_intra;
-	kernelconstant_rotlist      KerConst_rotlist;
-	kernelconstant_conform      KerConst_conform;
-	kernelconstant_grads        KerConst_grads;
+	// preparing the constant data fields for the GPU
+	kernelconstant_interintra*	KerConst_interintra = new kernelconstant_interintra;
+	kernelconstant_intracontrib*	KerConst_intracontrib = new kernelconstant_intracontrib;
+	kernelconstant_intra*		KerConst_intra = new kernelconstant_intra;
+	kernelconstant_rotlist*		KerConst_rotlist = new kernelconstant_rotlist;
+	kernelconstant_conform*		KerConst_conform = new kernelconstant_conform;
+	kernelconstant_grads*		KerConst_grads = new kernelconstant_grads;
 
 	if (prepare_const_fields_for_gpu(&myligand_reference, mypars,
-	                                 &KerConst_interintra,
-	                                 &KerConst_intracontrib,
-	                                 &KerConst_intra,
-	                                 &KerConst_rotlist,
-	                                 &KerConst_conform,
-	                                 &KerConst_grads) == 1)
-	{
+	                                 KerConst_interintra,
+	                                 KerConst_intracontrib,
+	                                 KerConst_intra,
+	                                 KerConst_rotlist,
+	                                 KerConst_conform,
+	                                 KerConst_grads) == 1) {
 		return 1;
 	}
 
-	size_t sz_interintra_const      = MAX_NUM_OF_ATOMS*sizeof(float) +
-	                                  MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                                  MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                                  MAX_NUM_OF_ATOMS*sizeof(char);
-
-	size_t sz_intracontrib_const    = 2*MAX_INTRAE_CONTRIBUTORS*sizeof(uint32_t);
-
-	size_t sz_intra_const           = MAX_NUM_OF_ATYPES*sizeof(unsigned int) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(unsigned short int) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*sizeof(float) +
-	                                  MAX_NUM_OF_ATYPES*sizeof(float);
-
-	size_t sz_rotlist_const         = MAX_NUM_OF_ROTATIONS*sizeof(int);
-
-	size_t sz_conform_const         = 3*MAX_NUM_OF_ATOMS*sizeof(float) +
-	                                  3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                                  3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                                  4*MAX_NUM_OF_RUNS*sizeof(float);
-
 	// Upload kernel constant data - JL FIXME - Can these be moved once?
-	status = cudaMemcpy(cData.pKerconst_interintra, &KerConst_interintra, sz_interintra_const, cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pKerconst_interintra, KerConst_interintra, sizeof(kernelconstant_interintra), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pKerconst_interintra: failed to upload to GPU memory.\n");
-	status = cudaMemcpy(cData.pKerconst_intracontrib, &KerConst_intracontrib, sz_intracontrib_const, cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pKerconst_intracontrib, KerConst_intracontrib, sizeof(kernelconstant_intracontrib), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pKerconst_intracontrib: failed to upload to GPU memory.\n");
-	status = cudaMemcpy(cData.pKerconst_intra, &KerConst_intra, sz_intra_const, cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pKerconst_intra, KerConst_intra, sizeof(kernelconstant_intra), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pKerconst_intra: failed to upload to GPU memory.\n");
-	status = cudaMemcpy(cData.pKerconst_rotlist, &KerConst_rotlist, sz_rotlist_const, cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pKerconst_rotlist, KerConst_rotlist, sizeof(kernelconstant_rotlist), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pKerconst_rotlist: failed to upload to GPU memory.\n");
-	status = cudaMemcpy(cData.pKerconst_conform, &KerConst_conform, sz_conform_const, cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pKerconst_conform, KerConst_conform, sizeof(kernelconstant_conform), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pKerconst_conform: failed to upload to GPU memory.\n");
-	cudaMemcpy(cData.pMem_rotbonds_const, KerConst_grads.rotbonds, 2*MAX_NUM_OF_ROTBONDS*sizeof(int), cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pMem_rotbonds_const, KerConst_grads->rotbonds, sizeof(KerConst_grads->rotbonds), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pMem_rotbonds_const: failed to upload to GPU memory.\n");
-	cudaMemcpy(cData.pMem_rotbonds_atoms_const, KerConst_grads.rotbonds_atoms, MAX_NUM_OF_ATOMS*MAX_NUM_OF_ROTBONDS*sizeof(int), cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pMem_rotbonds_atoms_const, KerConst_grads->rotbonds_atoms, sizeof(KerConst_grads->rotbonds_atoms), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pMem_rotbonds_atoms_const: failed to upload to GPU memory.\n");
-	cudaMemcpy(cData.pMem_num_rotating_atoms_per_rotbond_const, KerConst_grads.num_rotating_atoms_per_rotbond, MAX_NUM_OF_ROTBONDS*sizeof(int), cudaMemcpyHostToDevice);
+	status = cudaMemcpy(cData.pMem_num_rotating_atoms_per_rotbond_const, KerConst_grads->num_rotating_atoms_per_rotbond, sizeof(KerConst_grads->num_rotating_atoms_per_rotbond), cudaMemcpyHostToDevice);
 	RTERROR(status, "cData.pMem_num_rotating_atoms_per_rotbond_const failed to upload to GPU memory.\n");
 
-	// allocating GPU memory for populations, floatgirds,
-	// energies, evaluation counters and random number generator states
-	size_floatgrids = 4 * (sizeof(float)) * (mygrid->num_of_atypes+2) * (mygrid->size_xyz[0]) * (mygrid->size_xyz[1]) * (mygrid->size_xyz[2]);
-	if(cData.preload_gridsize==0){
-		status = cudaMalloc((void**)&(tData.pMem_fgrids), size_floatgrids);
+	// allocating GPU memory for grids, populations, energies,
+	// evaluation counters and random number generator states
+	if(cData.preallocated_gridsize==0){
+		status = cudaMalloc((void**)&(tData.pMem_fgrids), mygrid->grids.size()*sizeof(float));
 		RTERROR(status, "pMem_fgrids: failed to allocate GPU memory.\n");
 	}
-
 	// Flippable pointers
 	float* pMem_conformations_current = tData.pMem_conformations1;
 	float* pMem_conformations_next = tData.pMem_conformations2;
@@ -472,10 +441,8 @@ parameters argc and argv:
 	cData.warpbits = 5;
 
 	// Upload data
-	if(!floatgrids_preloaded){
-		status = cudaMemcpy(tData.pMem_fgrids, cpu_floatgrids, size_floatgrids, cudaMemcpyHostToDevice);
-		RTERROR(status, "pMem_fgrids: failed to upload to GPU memory.\n");
-	}
+	status = cudaMemcpy(tData.pMem_fgrids, mygrid->grids.data(), mygrid->grids.size()*sizeof(float), cudaMemcpyHostToDevice);
+	RTERROR(status, "pMem_fgrids: failed to upload to GPU memory.\n");
 	status = cudaMemcpy(pMem_conformations_current, cpu_init_populations, size_populations, cudaMemcpyHostToDevice);
 	RTERROR(status, "pMem_conformations_current: failed to upload to GPU memory.\n");
 	status = cudaMemcpy(tData.pMem_gpu_evals_of_runs, sim_state.cpu_evals_of_runs.data(), size_evals_of_runs, cudaMemcpyHostToDevice);
@@ -549,7 +516,7 @@ parameters argc and argv:
 //			heur_evals = (unsigned long)ceil(1000 * pow(2.0,0.5 * myligand_init->num_of_rotbonds + 6.0));
 			heur_evals = (unsigned long)ceil(64000 * pow(2.0, (0.5 - 0.2 * myligand_init->num_of_rotbonds/(20.0f + myligand_init->num_of_rotbonds)) * myligand_init->num_of_rotbonds));
 			} else{
-				printf("\nError: LS method \"%s\" is not supported by heuristics.\n       Please choose Solis-Wets (sw), Adadelta (ad),\n       or switch off the heuristics.\n",mypars->ls_method);
+				para_printf("\nError: LS method \"%s\" is not supported by heuristics.\n       Please choose Solis-Wets (sw), Adadelta (ad),\n       or switch off the heuristics.\n",mypars->ls_method);
 				exit(-1);
 			}
 		}
@@ -560,9 +527,9 @@ parameters argc and argv:
 		// => e = 1/19*hm
 		// at hm = 50 M => e0 = 2.63 M where e becomes less than 95% (about 11 torsions)
 		mypars->num_of_energy_evals = (unsigned long)ceil(heur_evals*(float)mypars->heuristics_max/(mypars->heuristics_max+heur_evals));
-		printf("    Using heuristics: (capped) number of evaluations set to %lu\n",mypars->num_of_energy_evals);
+		para_printf("    Using heuristics: (capped) number of evaluations set to %lu\n",mypars->num_of_energy_evals);
 		if (mypars->nev_provided && (mypars->num_of_energy_evals>nev)){
-			printf("    Overriding heuristics, setting number of evaluations to --nev = %lu instead.\n",nev);
+			para_printf("    Overriding heuristics, setting number of evaluations to --nev = %lu instead.\n",nev);
 			mypars->num_of_energy_evals = nev;
 			profile.capped = true;
 		}
@@ -572,14 +539,14 @@ parameters argc and argv:
 		float min_frac = a/(1+cap_fraction*cap_fraction*(a/(a-1.0f)-1.0f))+1.0f-a;
 		min_as_evals = (unsigned long)ceil(mypars->num_of_energy_evals*min_frac)*mypars->num_of_runs;
 		if(cap_fraction<0.5f){
-			printf("    Warning: The set number of evals is %.2f%% of the uncapped heuristics estimate of %lu evals.\n",cap_fraction*100.0f,heur_evals);
-			printf("             This means this docking may not be able to converge. Increasing ");
+			para_printf("    Warning: The set number of evals is %.2f%% of the uncapped heuristics estimate of %lu evals.\n",cap_fraction*100.0f,heur_evals);
+			para_printf("             This means this docking may not be able to converge. Increasing ");
 			if (mypars->nev_provided && (mypars->num_of_energy_evals>nev))
-				printf("--nev");
+				para_printf("--nev");
 			else
-				printf("--heurmax");
-			printf(" may improve\n             convergence but will also increase runtime.\n");
-			if(mypars->autostop) printf("             AutoStop will not stop before %.2f%% (%lu) of the set number of evaluations.\n",min_frac*100.0f,min_as_evals/mypars->num_of_runs);
+				para_printf("--heurmax");
+			para_printf(" may improve\n             convergence but will also increase runtime.\n");
+			if(mypars->autostop) para_printf("             AutoStop will not stop before %.2f%% (%lu) of the set number of evaluations.\n",min_frac*100.0f,min_as_evals/mypars->num_of_runs);
 		}
 	}
 	
@@ -594,13 +561,13 @@ parameters argc and argv:
 		strcpy(method_chosen,"ADAM (adam)");
 	}
 	else{
-		printf("\nError: LS method %s is not (yet) supported in the Cuda version.\n",mypars->ls_method);
+		para_printf("\nError: LS method %s is not (yet) supported in the Cuda version.\n",mypars->ls_method);
 		exit(-1);
 	}
-	printf("    Local-search chosen method is: %s\n", (cData.dockpars.lsearch_rate == 0.0f)? "GA" : method_chosen);
+	para_printf("    Local-search chosen method is: %s\n", (cData.dockpars.lsearch_rate == 0.0f)? "GA" : method_chosen);
 
 	if((mypars->initial_sw_generations>0) && (strcmp(mypars->ls_method, "sw") != 0))
-		printf("    Using Solis-Wets (sw) for the first %d generations.\n",mypars->initial_sw_generations);
+		para_printf("    Using Solis-Wets (sw) for the first %d generations.\n",mypars->initial_sw_generations);
 
 	// Get profile for timing
 	profile.adadelta=(strcmp(mypars->ls_method, "ad")==0);
@@ -609,8 +576,8 @@ parameters argc and argv:
 	profile.num_rotbonds = myligand_init->num_of_rotbonds;
 
 	/*
-	printf("dockpars.num_of_intraE_contributors:%u\n", dockpars.num_of_intraE_contributors);
-	printf("dockpars.rotbondlist_length:%u\n", dockpars.rotbondlist_length);
+	para_printf("dockpars.num_of_intraE_contributors:%u\n", dockpars.num_of_intraE_contributors);
+	para_printf("dockpars.rotbondlist_length:%u\n", dockpars.rotbondlist_length);
 	*/
 
 	clock_start_docking = clock();
@@ -618,7 +585,7 @@ parameters argc and argv:
 	SetKernelsGpuData(&cData);
 
 #ifdef DOCK_DEBUG
-	printf("\n");
+	para_printf("\n");
 	// Main while-loop iterarion counter
 	unsigned int ite_cnt = 0;
 #endif
@@ -629,10 +596,10 @@ parameters argc and argv:
 			  intrapair_cnt<dockpars.num_of_intraE_contributors;
 			  intrapair_cnt++) {
 		if (intrapair_cnt == 0) {
-			printf("%-10s %-10s %-10s\n", "#pair", "#atom1", "#atom2");
+			para_printf("%-10s %-10s %-10s\n", "#pair", "#atom1", "#atom2");
 		}
 
-		printf ("%-10u %-10u %-10u\n", intrapair_cnt,
+		para_printf ("%-10u %-10u %-10u\n", intrapair_cnt,
 		        KerConst.intraE_contributors_const[3*intrapair_cnt],
 		        KerConst.intraE_contributors_const[3*intrapair_cnt+1]);
 	}
@@ -642,7 +609,7 @@ parameters argc and argv:
 	uint32_t kernel1_gxsize = blocksPerGridForEachEntity;
 	uint32_t kernel1_lxsize = threadsPerBlock;
 #ifdef DOCK_DEBUG
-	printf("%-25s %10s %8lu %10s %4u\n", "K_INIT", "gSize: ", kernel1_gxsize, "lSize: ", kernel1_lxsize); fflush(stdout);
+	para_printf("%-25s %10s %8lu %10s %4u\n", "K_INIT", "gSize: ", kernel1_gxsize, "lSize: ", kernel1_lxsize); fflush(stdout);
 #endif
 	// End of Kernel1
 
@@ -650,7 +617,7 @@ parameters argc and argv:
 	uint32_t kernel2_gxsize = blocksPerGridForEachRun;
 	uint32_t kernel2_lxsize = threadsPerBlock;
 #ifdef DOCK_DEBUG
-	printf("%-25s %10s %8lu %10s %4u\n", "K_EVAL", "gSize: ", kernel2_gxsize, "lSize: ",  kernel2_lxsize); fflush(stdout);
+	para_printf("%-25s %10s %8lu %10s %4u\n", "K_EVAL", "gSize: ", kernel2_gxsize, "lSize: ",  kernel2_lxsize); fflush(stdout);
 #endif
 	// End of Kernel2
 
@@ -658,59 +625,63 @@ parameters argc and argv:
 	uint32_t kernel4_gxsize = blocksPerGridForEachEntity;
 	uint32_t kernel4_lxsize = threadsPerBlock;
 #ifdef DOCK_DEBUG
-	printf("%-25s %10s %8u %10s %4u\n", "K_GA_GENERATION", "gSize: ",  kernel4_gxsize, "lSize: ", kernel4_lxsize); fflush(stdout);
+	para_printf("%-25s %10s %8u %10s %4u\n", "K_GA_GENERATION", "gSize: ",  kernel4_gxsize, "lSize: ", kernel4_lxsize); fflush(stdout);
 #endif
 	// End of Kernel4
 
-	uint32_t kernel3_gxsize, kernel3_lxsize;
-	uint32_t kernel5_gxsize, kernel5_lxsize;
-	uint32_t kernel6_gxsize, kernel6_lxsize;
-	uint32_t kernel7_gxsize, kernel7_lxsize;
-	uint32_t kernel8_gxsize, kernel8_lxsize;
+	uint32_t kernel3_gxsize = 0;
+	uint32_t kernel3_lxsize = threadsPerBlock;
+/*
+	uint32_t kernel5_gxsize = 0;
+	uint32_t kernel5_lxsize = threadsPerBlock;
+	uint32_t kernel6_gxsize = 0;
+	uint32_t kernel6_lxsize = threadsPerBlock;
+*/
+	uint32_t kernel7_gxsize = 0;
+	uint32_t kernel7_lxsize = threadsPerBlock;
+	uint32_t kernel8_gxsize = 0;
+	uint32_t kernel8_lxsize = threadsPerBlock;
 	if (cData.dockpars.lsearch_rate != 0.0f) {
 
 		if ((strcmp(mypars->ls_method, "sw") == 0) || (mypars->initial_sw_generations>0)) {
 			// Kernel3
 			kernel3_gxsize = blocksPerGridForEachLSEntity;
-			kernel3_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_SOLISWETS", "gSize: ", kernel3_gxsize, "lSize: ", kernel3_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_SOLISWETS", "gSize: ", kernel3_gxsize, "lSize: ", kernel3_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel3
 		}
+/* SD and Fire are not currently supported by the Cuda version
 		if (strcmp(mypars->ls_method, "sd") == 0) {
 			// Kernel5
 			kernel5_gxsize = blocksPerGridForEachGradMinimizerEntity;
-			kernel5_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_SDESCENT", "gSize: ", kernel5_gxsize, "lSize: ", kernel5_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_SDESCENT", "gSize: ", kernel5_gxsize, "lSize: ", kernel5_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel5
 		}
 		if (strcmp(mypars->ls_method, "fire") == 0) {
 			// Kernel6
 			kernel6_gxsize = blocksPerGridForEachGradMinimizerEntity;
-			kernel6_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_FIRE", "gSize: ", kernel6_gxsize, "lSize: ", kernel6_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_FIRE", "gSize: ", kernel6_gxsize, "lSize: ", kernel6_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel6
 		}
+*/
 		if (strcmp(mypars->ls_method, "ad") == 0) {
 			// Kernel7
 			kernel7_gxsize = blocksPerGridForEachGradMinimizerEntity;
-			kernel7_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_ADADELTA", "gSize: ", kernel7_gxsize, "lSize: ", kernel7_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_ADADELTA", "gSize: ", kernel7_gxsize, "lSize: ", kernel7_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel7
 		}
 		if (strcmp(mypars->ls_method, "adam") == 0) {
 			// Kernel8
 			kernel8_gxsize = blocksPerGridForEachGradMinimizerEntity;
-			kernel8_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_ADADELTA", "gSize: ", kernel7_gxsize, "lSize: ", kernel7_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_ADAM", "gSize: ", kernel8_gxsize, "lSize: ", kernel8_lxsize); fflush(stdout); // report the ADAM kernel (Kernel8) launch sizes
 			#endif
 			// End of Kernel8
 		}
@@ -718,27 +689,27 @@ parameters argc and argv:
 
 	// Kernel1
 	#ifdef DOCK_DEBUG
-		printf("\nExecution starts:\n\n");
-		printf("%-25s", "\tK_INIT");fflush(stdout);
+		para_printf("\nExecution starts:\n\n");
+		para_printf("%-25s", "\tK_INIT");fflush(stdout);
 		cudaDeviceSynchronize();
 	#endif
 	gpu_calc_initpop(kernel1_gxsize, kernel1_lxsize, pMem_conformations_current, pMem_energies_current);
 	//runKernel1D(command_queue,kernel1,kernel1_gxsize,kernel1_lxsize,&time_start_kernel,&time_end_kernel);
 	#ifdef DOCK_DEBUG
 		cudaDeviceSynchronize();
-		printf("%15s" ," ... Finished\n");fflush(stdout);
+		para_printf("%15s" ," ... Finished\n");fflush(stdout);
 	#endif
 	// End of Kernel1
 
 	// Kernel2
 	#ifdef DOCK_DEBUG
-		printf("%-25s", "\tK_EVAL");fflush(stdout);
+		para_printf("%-25s", "\tK_EVAL");fflush(stdout);
 	#endif
 	//runKernel1D(command_queue,kernel2,kernel2_gxsize,kernel2_lxsize,&time_start_kernel,&time_end_kernel);
 	gpu_sum_evals(kernel2_gxsize, kernel2_lxsize);
 	#ifdef DOCK_DEBUG
 		cudaDeviceSynchronize();
-		printf("%15s" ," ... Finished\n");fflush(stdout);
+		para_printf("%15s" ," ... Finished\n");fflush(stdout);
 	#endif
 	// End of Kernel2
 	// ===============================================================================
@@ -750,10 +721,10 @@ parameters argc and argv:
 	unsigned long total_evals;
 
 	auto const t2 = std::chrono::steady_clock::now();
-	printf("\nRest of Setup time %fs\n", elapsed_seconds(t1 ,t2));
+	para_printf("\nRest of Setup time %fs\n", elapsed_seconds(t1 ,t2));
 
 	//print progress bar
-	AutoStop autostop(mypars->pop_size, mypars->num_of_runs, mypars->stopstd, mypars->as_frequency);
+	AutoStop autostop(mypars->pop_size, mypars->num_of_runs, mypars->stopstd, mypars->as_frequency, output);
 #ifndef DOCK_DEBUG
 	if (mypars->autostop)
 	{
@@ -761,21 +732,23 @@ parameters argc and argv:
 	}
 	else
 	{
-		printf("\nExecuting docking runs:\n");
-		printf("        20%%        40%%       60%%       80%%       100%%\n");
-		printf("---------+---------+---------+---------+---------+\n");
+		para_printf("\nExecuting docking runs:\n");
+		para_printf("        20%%        40%%       60%%       80%%       100%%\n");
+		para_printf("---------+---------+---------+---------+---------+\n");
 	}
 #endif
 	curr_progress_cnt = 0;
 
-	// -------- Replacing with memory maps! ------------
+#if defined (MAPPED_COPY)
 	while ((progress = check_progress(tData.pMem_gpu_evals_of_runs, generation_cnt, mypars->num_of_energy_evals, mypars->num_of_generations, mypars->num_of_runs, total_evals)) < 100.0)
-	// -------- Replacing with memory maps! ------------
+#else
+	while ((progress = check_progress(sim_state.cpu_evals_of_runs.data(), generation_cnt, mypars->num_of_energy_evals, mypars->num_of_generations, mypars->num_of_runs, total_evals)) < 100.0)
+#endif
 	{
 		if (mypars->autostop) {
 			if (generation_cnt % mypars->as_frequency == 0) {
 				status = cudaMemcpy(sim_state.cpu_energies.data(), pMem_energies_current, size_energies, cudaMemcpyDeviceToHost);
-				RTERROR(status, "cudaMemcpy: couldn't downloaded pMem_energies_current");
+				RTERROR(status, "cudaMemcpy: couldn't download pMem_energies_current");
 				if (autostop.check_if_satisfactory(generation_cnt, sim_state.cpu_energies.data(), total_evals))
 					if (total_evals>min_as_evals)
 						break; // Exit loop when all conditions are satisfied
@@ -785,7 +758,7 @@ parameters argc and argv:
 		{
 #ifdef DOCK_DEBUG
 			ite_cnt++;
-			printf("\nLGA iteration # %u\n", ite_cnt);
+			para_printf("\nLGA iteration # %u\n", ite_cnt);
 			fflush(stdout);
 #endif
 			//update progress bar (bar length is 50)
@@ -795,74 +768,74 @@ parameters argc and argv:
 			while (curr_progress_cnt < new_progress_cnt) {
 				curr_progress_cnt++;
 #ifndef DOCK_DEBUG
-				printf("*");
+				para_printf("*");
 #endif
 				fflush(stdout);
 			}
 		}
 		// Kernel4
 		#ifdef DOCK_DEBUG
-			printf("%-25s", "\tK_GA_GENERATION");fflush(stdout);
+			para_printf("%-25s", "\tK_GA_GENERATION");fflush(stdout);
 		#endif
 
 		//runKernel1D(command_queue,kernel4,kernel4_gxsize,kernel4_lxsize,&time_start_kernel,&time_end_kernel);
 		gpu_gen_and_eval_newpops(kernel4_gxsize, kernel4_lxsize, pMem_conformations_current, pMem_energies_current, pMem_conformations_next, pMem_energies_next);
 		#ifdef DOCK_DEBUG
-			printf("%15s", " ... Finished\n");fflush(stdout);
+			para_printf("%15s", " ... Finished\n");fflush(stdout);
 		#endif
 		// End of Kernel4
 		if (cData.dockpars.lsearch_rate != 0.0f) {
 			if ((strcmp(mypars->ls_method, "sw") == 0) || ((strcmp(mypars->ls_method, "ad") == 0) && (generation_cnt<mypars->initial_sw_generations))) {
 				// Kernel3
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_SOLISWETS");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_SOLISWETS");fflush(stdout);
 				#endif
 				//runKernel1D(command_queue,kernel3,kernel3_gxsize,kernel3_lxsize,&time_start_kernel,&time_end_kernel);
 				gpu_perform_LS(kernel3_gxsize, kernel3_lxsize, pMem_conformations_next, pMem_energies_next);                
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel3
 			} else if (strcmp(mypars->ls_method, "sd") == 0) {
 				// Kernel5
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_GRAD_SDESCENT");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_GRAD_SDESCENT");fflush(stdout);
 				#endif
 				//runKernel1D(command_queue,kernel5,kernel5_gxsize,kernel5_lxsize,&time_start_kernel,&time_end_kernel);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel5
 			} else if (strcmp(mypars->ls_method, "fire") == 0) {
 				// Kernel6
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_GRAD_FIRE");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_GRAD_FIRE");fflush(stdout);
 				#endif
 				//runKernel1D(command_queue,kernel6,kernel6_gxsize,kernel6_lxsize,&time_start_kernel,&time_end_kernel);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel6
 			} else if (strcmp(mypars->ls_method, "ad") == 0) {
 				// Kernel7
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_GRAD_ADADELTA");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_GRAD_ADADELTA");fflush(stdout);
 				#endif
 				// runKernel1D(command_queue,kernel7,kernel7_gxsize,kernel7_lxsize,&time_start_kernel,&time_end_kernel);
 				gpu_gradient_minAD(kernel7_gxsize, kernel7_lxsize, pMem_conformations_next, pMem_energies_next);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel7
 			} else if (strcmp(mypars->ls_method, "adam") == 0) {
 				// Kernel8
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_GRAD_ADAM");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_GRAD_ADAM");fflush(stdout);
 				#endif
 				// runKernel1D(command_queue,kernel8,kernel8_gxsize,kernel8_lxsize,&time_start_kernel,&time_end_kernel);
 				gpu_gradient_minAdam(kernel8_gxsize, kernel8_lxsize, pMem_conformations_next, pMem_energies_next);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel8
 			}
@@ -871,23 +844,19 @@ parameters argc and argv:
 		// -------- Replacing with memory maps! ------------
 		// Kernel2
 		#ifdef DOCK_DEBUG
-			printf("%-25s", "\tK_EVAL");fflush(stdout);
+			para_printf("%-25s", "\tK_EVAL");fflush(stdout);
 		#endif
 		//runKernel1D(command_queue,kernel2,kernel2_gxsize,kernel2_lxsize,&time_start_kernel,&time_end_kernel);
 		gpu_sum_evals(kernel2_gxsize, kernel2_lxsize);
 
 		#ifdef DOCK_DEBUG
-			printf("%15s" ," ... Finished\n");fflush(stdout);
+			para_printf("%15s" ," ... Finished\n");fflush(stdout);
 		#endif
 		// End of Kernel2
 		// ===============================================================================
-		// -------- Replacing with memory maps! ------------
-#if defined (MAPPED_COPY)
-		//map_cpu_evals_of_runs = (int*) memMap(command_queue, mem_gpu_evals_of_runs, CL_MAP_READ, size_evals_of_runs);
-#else
-		cudaMemcpy(sim_state.cpu_evals_of_runs.data(), pMem_gpu_evals_of_runs, size_evals_of_runs, cudaMemcpyDeviceToHost);
+#if not defined (MAPPED_COPY)
+		cudaMemcpy(sim_state.cpu_evals_of_runs.data(), tData.pMem_gpu_evals_of_runs, size_evals_of_runs, cudaMemcpyDeviceToHost);
 #endif
-		// -------- Replacing with memory maps! ------------
 		generation_cnt++;
 		// ----------------------------------------------------------------------
 		// ORIGINAL APPROACH: switching conformation and energy pointers (Probably the best approach, restored)
@@ -914,7 +883,7 @@ parameters argc and argv:
 
 		// ----------------------------------------------------------------------
 		#ifdef DOCK_DEBUG
-			printf("\tProgress %.3f %%\n", progress);
+			para_printf("\tProgress %.3f %%\n", progress);
 			fflush(stdout);
 		#endif
 	} // End of while-loop
@@ -929,10 +898,10 @@ parameters argc and argv:
 		//update progress bar (bar length is 50)mem_num_of_rotatingatoms_per_rotbond_const
 		while (curr_progress_cnt < 50) {
 			curr_progress_cnt++;
-			printf("*");
+			para_printf("*");
 			fflush(stdout);
 		}
-		printf("\n");
+		para_printf("\n");
 	}
 
 	auto const t3 = std::chrono::steady_clock::now();
@@ -949,12 +918,12 @@ parameters argc and argv:
 
 	// Final autostop statistics output
 	if (mypars->autostop) autostop.output_final_stddev(generation_cnt, sim_state.cpu_energies.data(), total_evals);
-	printf("\nDocking time %fs\n", elapsed_seconds(t2, t3));
+	para_printf("\nDocking time %fs\n", elapsed_seconds(t2, t3));
 #if defined (DOCK_DEBUG)
 	for (int cnt_pop=0;cnt_pop<size_populations/sizeof(float);cnt_pop++)
-		printf("total_num_pop: %u, cpu_final_populations[%u]: %f\n",(unsigned int)(size_populations/sizeof(float)),cnt_pop,cpu_final_populations[cnt_pop]);
+		para_printf("total_num_pop: %u, cpu_final_populations[%u]: %f\n",(unsigned int)(size_populations/sizeof(float)),cnt_pop,cpu_final_populations[cnt_pop]);
 	for (int cnt_pop=0;cnt_pop<size_energies/sizeof(float);cnt_pop++)
-		printf("total_num_energies: %u, cpu_energies[%u]: %f\n",    (unsigned int)(size_energies/sizeof(float)),cnt_pop,sim_state.cpu_energies[cnt_pop]);
+		para_printf("total_num_energies: %u, cpu_energies[%u]: %f\n",    (unsigned int)(size_energies/sizeof(float)),cnt_pop,sim_state.cpu_energies[cnt_pop]);
 #endif
 
 	// Assign simulation results to sim_state
@@ -965,8 +934,17 @@ parameters argc and argv:
 
 	free(cpu_prng_seeds);
 
+	delete KerConst_interintra;
+	delete KerConst_intracontrib;
+	delete KerConst_intra;
+	delete KerConst_rotlist;
+	delete KerConst_conform;
+	delete KerConst_grads;
+
 	auto const t4 = std::chrono::steady_clock::now();
-	printf("\nShutdown time %fs\n", elapsed_seconds(t3, t4));
+	para_printf("\nShutdown time %fs\n", elapsed_seconds(t3, t4));
+	if(output!=NULL) free(outbuf);
+
 	return 0;
 }
 
diff --git a/host/src/performdocking.cpp.OpenCL b/host/src/performdocking.cpp.OpenCL
index f740280e..f43eb429 100644
--- a/host/src/performdocking.cpp.OpenCL
+++ b/host/src/performdocking.cpp.OpenCL
@@ -107,20 +107,28 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "stringify.h"
 #include "correct_grad_axisangle.h"
 
-void copy_map_to_gpu(
-                     GpuTempData&      tData,
-                     std::vector<Map>& all_maps,
-                     int               t,
-                     int               size_of_one_map
-                    )
+std::vector<int> get_gpu_pool() // Builds a list with one int per OpenCL device found across all platforms; exits the process if the list ends up empty.
 {
-	cl_int err;
-	err = clEnqueueWriteBuffer(tData.command_queue,tData.pMem_fgrids,true,t*size_of_one_map*sizeof(float),size_of_one_map*sizeof(float),all_maps[t].grid.data(),0,NULL,NULL);
-	if (err != CL_SUCCESS){
-		printf("pMem_fgrids: failed to upload maps to GPU memory. %d\n", err);
-		fflush(stdout);
+	cl_platform_id*  platform_id; // filled in by getPlatforms(); NOTE(review): never released in this function - confirm whether the helper allocates it
+	cl_device_id*    device_ids; // filled in by getDevices() once per platform; NOTE(review): never released in this function - confirm whether the helper allocates it
+	cl_uint platformCount; // number of platforms reported by getPlatforms()
+	cl_uint deviceCount; // number of devices reported by getDevices() for the platform currently being visited
+	std::vector<int> result; // one encoded entry per device, in platform-then-device order
+
+	// Get all available platforms; if this call does not return CL_SUCCESS nothing is enumerated and result stays empty
+	if (getPlatforms(&platform_id,&platformCount) == CL_SUCCESS)
+		for(unsigned int platform_nr=0; platform_nr<platformCount; platform_nr++)
+			// Get all devices of the given platform; platforms for which this call does not return CL_SUCCESS are skipped
+			if (getDevices(platform_id[platform_nr],platformCount,&device_ids,&deviceCount) == CL_SUCCESS) // NOTE(review): second argument is platformCount - confirm against the getDevices() declaration
+				for(unsigned int i=0; i<deviceCount; i++)
+					result.push_back(i+(platform_nr<<8)); // entry = device index + (platform index << 8); presumably assumes fewer than 256 devices per platform - TODO confirm
+
+	if (result.size() == 0) // no device was added above (either call failed, or no platforms/devices were reported)
+	{
+		printf("No suitable OpenCL devices found, exiting.\n");
 		exit(-1);
 	}
+	return result;
 }
 
 void setup_gpu_for_docking(
@@ -229,15 +237,12 @@ void setup_gpu_for_docking(
 	device_id=device_ids[cData.devnum];
 	size_t dev_name_size;
 	clGetDeviceInfo(device_ids[cData.devnum], CL_DEVICE_NAME, 0, NULL, &dev_name_size);
-	char* device_name = (char*) malloc(dev_name_size);
-	clGetDeviceInfo(device_ids[cData.devnum], CL_DEVICE_NAME, dev_name_size, device_name, NULL);
-	printf("OpenCL device:                           %s",device_name);
-	if(deviceCount>1) printf(" (#%d / %d)",cData.devnum+1,deviceCount);
-	printf("\n");
+	tData.device_name = (char*) malloc(dev_name_size+32); // make sure array is large enough to hold device number text too
+	clGetDeviceInfo(device_ids[cData.devnum], CL_DEVICE_NAME, dev_name_size, tData.device_name, NULL);
+	if(deviceCount>1) sprintf(&tData.device_name[dev_name_size-1], " (#%d / %d)",cData.devnum+1,deviceCount);
+	printf("OpenCL device:                           %s\n",tData.device_name);
 	cData.devnum=-2;
 
-	free(device_name);
-
 	// Create context from first platform
 	if (createContext(platform_id[platform_nr],1,&device_id,&tData.context) != 0) exit(-1);
 
@@ -271,40 +276,18 @@ void setup_gpu_for_docking(
 // End of OpenCL Host Setup
 // =======================================================================
 
-	size_t sz_interintra_const   = MAX_NUM_OF_ATOMS*sizeof(float) + 
-	                               MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                               MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                               MAX_NUM_OF_ATOMS*sizeof(char);
-
-	size_t sz_intracontrib_const = 2*MAX_INTRAE_CONTRIBUTORS*sizeof(uint32_t);
-
-	size_t sz_intra_const        = MAX_NUM_OF_ATYPES*sizeof(unsigned int) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(unsigned short) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*sizeof(float);
-
-	size_t sz_rotlist_const      = MAX_NUM_OF_ROTATIONS*sizeof(int);
-
-	size_t sz_conform_const      = 3*MAX_NUM_OF_ATOMS*sizeof(float) +
-	                               3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                               3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                               4*MAX_NUM_OF_RUNS*sizeof(float);
-
 	// These constants are allocated in global memory since
 	// there is a limited number of constants that can be passed
 	// as arguments to kernel
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,sz_interintra_const,        &cData.mem_interintra_const);
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,sz_intracontrib_const,      &cData.mem_intracontrib_const);
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,sz_intra_const,             &cData.mem_intra_const);
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,sz_rotlist_const,           &cData.mem_rotlist_const);
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,sz_conform_const,           &cData.mem_conform_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, sizeof(kernelconstant_interintra),        &cData.mem_interintra_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, sizeof(kernelconstant_intracontrib),      &cData.mem_intracontrib_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, sizeof(kernelconstant_intra),             &cData.mem_intra_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, sizeof(kernelconstant_rotlist),           &cData.mem_rotlist_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, sizeof(kernelconstant_conform),           &cData.mem_conform_const);
 
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,2*MAX_NUM_OF_ROTBONDS*sizeof(int),                &cData.mem_rotbonds_const);
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,MAX_NUM_OF_ATOMS*MAX_NUM_OF_ROTBONDS*sizeof(int), &cData.mem_rotbonds_atoms_const);
-	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,MAX_NUM_OF_ROTBONDS*sizeof(int),                  &cData.mem_num_rotating_atoms_per_rotbond_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, 2*MAX_NUM_OF_ROTBONDS*sizeof(int),                &cData.mem_rotbonds_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, MAX_NUM_OF_ATOMS*MAX_NUM_OF_ROTBONDS*sizeof(int), &cData.mem_rotbonds_atoms_const);
+	mallocBufferObject(tData.context,CL_MEM_READ_ONLY, MAX_NUM_OF_ROTBONDS*sizeof(int),                  &cData.mem_num_rotating_atoms_per_rotbond_const);
 
 	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,1000*sizeof(float),&cData.mem_angle_const);
 	mallocBufferObject(tData.context,CL_MEM_READ_ONLY,1000*sizeof(float),&cData.mem_dependence_on_theta_const);
@@ -314,8 +297,8 @@ void setup_gpu_for_docking(
 	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_dependence_on_theta_const,    false,  &dependence_on_theta,    1000*sizeof(float));
 	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_dependence_on_rotangle_const, false,  &dependence_on_rotangle, 1000*sizeof(float));
 
-	if(cData.preload_gridsize>0)
-		mallocBufferObject(tData.context,CL_MEM_READ_ONLY,cData.preload_gridsize* (sizeof(float)) * (ATYPE_NUM+2),&(tData.pMem_fgrids));
+	if(cData.preallocated_gridsize>0)
+		mallocBufferObject(tData.context,CL_MEM_READ_ONLY,cData.preallocated_gridsize*sizeof(float),&(tData.pMem_fgrids));
 }
 
 void finish_gpu_from_docking(
@@ -323,6 +306,8 @@ void finish_gpu_from_docking(
                              GpuTempData& tData
                             )
 {
+	if(cData.devnum>-2) return; // device not set up
+	
 	clReleaseMemObject(cData.mem_interintra_const);
 	clReleaseMemObject(cData.mem_intracontrib_const);
 	clReleaseMemObject(cData.mem_intra_const);
@@ -353,11 +338,11 @@ void finish_gpu_from_docking(
 	clReleaseProgram(tData.program);
 	clReleaseCommandQueue(tData.command_queue);
 	clReleaseContext(tData.context);
+	free(tData.device_name);
 }
 
 int docking_with_gpu(
                      const Gridinfo*        mygrid,
-                     /*const*/ float*       cpu_floatgrids,
                            Dockpars*        mypars,
                      const Liganddata*      myligand_init,
                      const Liganddata*      myxrayligand,
@@ -367,21 +352,18 @@ int docking_with_gpu(
                            SimulationState& sim_state,
                            GpuData&         cData,
                            GpuTempData&     tData,
-                           bool             floatgrids_preloaded
+                           std::string*     output
                     )
 /* The function performs the docking algorithm and generates the corresponding result files.
 parameter mygrid:
 		describes the grid
 		filled with get_gridinfo()
-parameter cpu_floatgrids:
-		points to the memory region containing the grids
-		filled with get_gridvalues_f()
 parameter mypars:
 		describes the docking parameters
 		filled with get_commandpars()
 parameter myligand_init:
 		describes the ligands
-		filled with get_liganddata()
+		filled with parse_liganddata()
 parameter myxrayligand:
 		describes the xray ligand
 		filled with get_xrayliganddata()
@@ -389,6 +371,9 @@ parameters argc and argv:
 		are the corresponding command line arguments parameter
 */
 {
+	char* outbuf;
+	if(output!=NULL) outbuf = (char*)malloc(256*sizeof(char));
+
 	// Times
 	cl_ulong time_start_kernel;
 	cl_ulong time_end_kernel;
@@ -412,7 +397,6 @@ parameters argc and argv:
 	unsigned int* cpu_prng_seeds;
 
 	Dockparameters dockpars;
-	size_t size_floatgrids;
 	size_t size_populations;
 	size_t size_energies;
 	size_t size_prng_seeds;
@@ -471,60 +455,38 @@ parameters argc and argv:
 	memset(sim_state.cpu_evals_of_runs.data(), 0, size_evals_of_runs);
 
 	// preparing the constant data fields for the GPU
-	kernelconstant_interintra	KerConst_interintra;
-	kernelconstant_intracontrib	KerConst_intracontrib;
-	kernelconstant_intra		KerConst_intra;
-	kernelconstant_rotlist		KerConst_rotlist;
-	kernelconstant_conform		KerConst_conform;
-	kernelconstant_grads		KerConst_grads;
+	kernelconstant_interintra*	KerConst_interintra = new kernelconstant_interintra;
+	kernelconstant_intracontrib*	KerConst_intracontrib = new kernelconstant_intracontrib;
+	kernelconstant_intra*		KerConst_intra = new kernelconstant_intra;
+	kernelconstant_rotlist*		KerConst_rotlist = new kernelconstant_rotlist;
+	kernelconstant_conform*		KerConst_conform = new kernelconstant_conform;
+	kernelconstant_grads*		KerConst_grads = new kernelconstant_grads;
 
 	if (prepare_const_fields_for_gpu(&myligand_reference, mypars,
-	                                 &KerConst_interintra,
-	                                 &KerConst_intracontrib,
-	                                 &KerConst_intra,
-	                                 &KerConst_rotlist,
-	                                 &KerConst_conform,
-	                                 &KerConst_grads) == 1) {
+	                                 KerConst_interintra,
+	                                 KerConst_intracontrib,
+	                                 KerConst_intra,
+	                                 KerConst_rotlist,
+	                                 KerConst_conform,
+	                                 KerConst_grads) == 1) {
 		return 1;
 	}
 
-	size_t sz_interintra_const   = MAX_NUM_OF_ATOMS*sizeof(float) +
-	                               MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                               MAX_NUM_OF_ATOMS*sizeof(uint32_t) +
-	                               MAX_NUM_OF_ATOMS*sizeof(char);
-
-	size_t sz_intracontrib_const = 2*MAX_INTRAE_CONTRIBUTORS*sizeof(uint32_t);
-
-	size_t sz_intra_const        = MAX_NUM_OF_ATYPES*sizeof(unsigned int) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(unsigned short) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*sizeof(float) +
-	                               MAX_NUM_OF_ATYPES*sizeof(float);
-
-	size_t sz_rotlist_const      = MAX_NUM_OF_ROTATIONS*sizeof(int);
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_interintra_const,   false, KerConst_interintra,   sizeof(kernelconstant_interintra));
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_intracontrib_const, false, KerConst_intracontrib, sizeof(kernelconstant_intracontrib));
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_intra_const,        false, KerConst_intra,        sizeof(kernelconstant_intra));
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_rotlist_const,      false, KerConst_rotlist,      sizeof(kernelconstant_rotlist));
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_conform_const,      false, KerConst_conform,      sizeof(kernelconstant_conform));
 
-	size_t sz_conform_const      = 3*MAX_NUM_OF_ATOMS*sizeof(float) +
-	                               3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                               3*MAX_NUM_OF_ROTBONDS*sizeof(float) +
-	                               4*MAX_NUM_OF_RUNS*sizeof(float);
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_rotbonds_const,                       false, KerConst_grads->rotbonds,                       sizeof(KerConst_grads->rotbonds));
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_rotbonds_atoms_const,                 false, KerConst_grads->rotbonds_atoms,                 sizeof(KerConst_grads->rotbonds_atoms));
+	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_num_rotating_atoms_per_rotbond_const, false, KerConst_grads->num_rotating_atoms_per_rotbond, sizeof(KerConst_grads->num_rotating_atoms_per_rotbond));
 
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_interintra_const,   false, &KerConst_interintra,   sz_interintra_const);
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_intracontrib_const, false, &KerConst_intracontrib, sz_intracontrib_const);
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_intra_const,        false, &KerConst_intra,        sz_intra_const);
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_rotlist_const,      false, &KerConst_rotlist,      sz_rotlist_const);
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_conform_const,      false, &KerConst_conform,      sz_conform_const);
-
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_rotbonds_const,                       false, &KerConst_grads.rotbonds,                       2*MAX_NUM_OF_ROTBONDS*sizeof(int));
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_rotbonds_atoms_const,                 false, &KerConst_grads.rotbonds_atoms,                 MAX_NUM_OF_ATOMS*MAX_NUM_OF_ROTBONDS*sizeof(int));
-	memcopyBufferObjectToDevice(tData.command_queue,cData.mem_num_rotating_atoms_per_rotbond_const, false, &KerConst_grads.num_rotating_atoms_per_rotbond, MAX_NUM_OF_ROTBONDS*sizeof(int));
 
 	// ----------------------------------------------------------------------
 
- 	// allocating GPU memory for populations, floatgirds,
+	// allocating GPU memory for populations, floatgrids,
 	// energies, evaluation counters and random number generator states
-	size_floatgrids = 4 * (sizeof(float)) * (mygrid->num_of_atypes+2) * (mygrid->size_xyz[0]) * (mygrid->size_xyz[1]) * (mygrid->size_xyz[2]);
 
 	cl_mem mem_dockpars_conformations_current;
 	cl_mem mem_dockpars_energies_current;
@@ -534,29 +496,28 @@ parameters argc and argv:
 	cl_mem mem_gpu_evals_of_runs;
 	cl_mem mem_dockpars_prng_states;
 
-	if(cData.preload_gridsize==0)
-		mallocBufferObject(tData.context,CL_MEM_READ_ONLY,size_floatgrids, &tData.pMem_fgrids);
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE,size_populations, &mem_dockpars_conformations_current);
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE,size_energies,    &mem_dockpars_energies_current);
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE,size_populations, &mem_dockpars_conformations_next);
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE,size_energies,    &mem_dockpars_energies_next);
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE,mypars->pop_size*mypars->num_of_runs*sizeof(int), &mem_dockpars_evals_of_new_entities);
+	if(cData.preallocated_gridsize==0)
+		mallocBufferObject(tData.context, CL_MEM_READ_ONLY, mygrid->grids.size()*sizeof(float), &tData.pMem_fgrids);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE, size_populations, &mem_dockpars_conformations_current);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE, size_energies,    &mem_dockpars_energies_current);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE, size_populations, &mem_dockpars_conformations_next);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE, size_energies,    &mem_dockpars_energies_next);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE, mypars->pop_size*mypars->num_of_runs*sizeof(int), &mem_dockpars_evals_of_new_entities);
 
 	// -------- Replacing with memory maps! ------------
 #if defined (MAPPED_COPY)
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ,size_evals_of_runs, &mem_gpu_evals_of_runs);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR , size_evals_of_runs, &mem_gpu_evals_of_runs);
 #else
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE,size_evals_of_runs, &mem_gpu_evals_of_runs);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE, size_evals_of_runs, &mem_gpu_evals_of_runs);
 #endif
 	// -------- Replacing with memory maps! ------------
 
-	mallocBufferObject(tData.context,CL_MEM_READ_WRITE,size_prng_seeds, &mem_dockpars_prng_states);
+	mallocBufferObject(tData.context, CL_MEM_READ_WRITE, size_prng_seeds, &mem_dockpars_prng_states);
 
-	if(!floatgrids_preloaded)
-		memcopyBufferObjectToDevice(tData.command_queue,tData.pMem_fgrids,          false, cpu_floatgrids,                     size_floatgrids);
-	memcopyBufferObjectToDevice(tData.command_queue,mem_dockpars_conformations_current, false, cpu_init_populations,               size_populations);
-	memcopyBufferObjectToDevice(tData.command_queue,mem_gpu_evals_of_runs,              false, sim_state.cpu_evals_of_runs.data(), size_evals_of_runs);
-	memcopyBufferObjectToDevice(tData.command_queue,mem_dockpars_prng_states,           false, cpu_prng_seeds,                     size_prng_seeds);
+	memcopyBufferObjectToDevice(tData.command_queue, tData.pMem_fgrids,                  false, mygrid->grids.data(),               mygrid->grids.size()*sizeof(float));
+	memcopyBufferObjectToDevice(tData.command_queue, mem_dockpars_conformations_current, false, cpu_init_populations,               size_populations);
+	memcopyBufferObjectToDevice(tData.command_queue, mem_gpu_evals_of_runs,              false, sim_state.cpu_evals_of_runs.data(), size_evals_of_runs);
+	memcopyBufferObjectToDevice(tData.command_queue, mem_dockpars_prng_states,           false, cpu_prng_seeds,                     size_prng_seeds);
 
 	// preparing parameter struct
 	dockpars.num_of_atoms      = ((int)  myligand_reference.num_of_atoms);
@@ -625,7 +586,7 @@ parameters argc and argv:
 //			heur_evals = (unsigned long)ceil(1000 * pow(2.0,0.5 * myligand_init->num_of_rotbonds + 6.0));
 			heur_evals = (unsigned long)ceil(64000 * pow(2.0, (0.5 - 0.2 * myligand_init->num_of_rotbonds/(20.0f + myligand_init->num_of_rotbonds)) * myligand_init->num_of_rotbonds));
 			} else{
-				printf("\nError: LS method \"%s\" is not supported by heuristics.\n       Please choose Solis-Wets (sw), Adadelta (ad),\n       or switch off the heuristics.\n",mypars->ls_method);
+				para_printf("\nError: LS method \"%s\" is not supported by heuristics.\n       Please choose Solis-Wets (sw), Adadelta (ad),\n       or switch off the heuristics.\n",mypars->ls_method);
 				exit(-1);
 			}
 		}
@@ -636,9 +597,9 @@ parameters argc and argv:
 		// => e = 1/19*hm
 		// at hm = 50 M => e0 = 2.63 M where e becomes less than 95% (about 11 torsions)
 		mypars->num_of_energy_evals = (unsigned long)ceil(heur_evals*(float)mypars->heuristics_max/(mypars->heuristics_max+heur_evals));
-		printf("    Using heuristics: (capped) number of evaluations set to %lu\n",mypars->num_of_energy_evals);
+		para_printf("    Using heuristics: (capped) number of evaluations set to %lu\n",mypars->num_of_energy_evals);
 		if (mypars->nev_provided && (mypars->num_of_energy_evals>nev)){
-			printf("    Overriding heuristics, setting number of evaluations to --nev = %lu instead.\n",nev);
+			para_printf("    Overriding heuristics, setting number of evaluations to --nev = %lu instead.\n",nev);
 			mypars->num_of_energy_evals = nev;
 			profile.capped = true;
 		}
@@ -648,14 +609,14 @@ parameters argc and argv:
 		float min_frac = a/(1+cap_fraction*cap_fraction*(a/(a-1.0f)-1.0f))+1.0f-a;
 		min_as_evals = (unsigned long)ceil(mypars->num_of_energy_evals*min_frac)*mypars->num_of_runs;
 		if(cap_fraction<0.5f){
-			printf("    Warning: The set number of evals is %.2f%% of the uncapped heuristics estimate of %lu evals.\n",cap_fraction*100.0f,heur_evals);
-			printf("             This means this docking may not be able to converge. Increasing ");
+			para_printf("    Warning: The set number of evals is %.2f%% of the uncapped heuristics estimate of %lu evals.\n",cap_fraction*100.0f,heur_evals);
+			para_printf("             This means this docking may not be able to converge. Increasing ");
 			if (mypars->nev_provided && (mypars->num_of_energy_evals>nev))
-				printf("--nev");
+				para_printf("--nev");
 			else
-				printf("--heurmax");
-			printf(" may improve\n             convergence but will also increase runtime.\n");
-			if(mypars->autostop) printf("             AutoStop will not stop before %.2f%% (%lu) of the set number of evaluations.\n",min_frac*100.0f,min_as_evals/mypars->num_of_runs);
+				para_printf("--heurmax");
+			para_printf(" may improve\n             convergence but will also increase runtime.\n");
+			if(mypars->autostop) para_printf("             AutoStop will not stop before %.2f%% (%lu) of the set number of evaluations.\n",min_frac*100.0f,min_as_evals/mypars->num_of_runs);
 		}
 	}
 	
@@ -673,13 +634,13 @@ parameters argc and argv:
 		strcpy(method_chosen,"ADADELTA (ad)");
 	}
 	else{
-		printf("\nError: LS method %s is not (yet) supported in the OpenCL version.\n",mypars->ls_method);
+		para_printf("\nError: LS method %s is not (yet) supported in the OpenCL version.\n",mypars->ls_method);
 		exit(-1);
 	}
-	printf("    Local-search chosen method is: %s\n", (dockpars.lsearch_rate == 0.0f)? "GA" : method_chosen);
+	para_printf("    Local-search chosen method is: %s\n", (dockpars.lsearch_rate == 0.0f)? "GA" : method_chosen);
 
 	if((mypars->initial_sw_generations>0) && (strcmp(mypars->ls_method, "sw") != 0))
-		printf("    Using Solis-Wets (sw) for the first %d generations.\n",mypars->initial_sw_generations);
+		para_printf("    Using Solis-Wets (sw) for the first %d generations.\n",mypars->initial_sw_generations);
 
         // Get profile for timing
 	profile.adadelta=(strcmp(mypars->ls_method, "ad")==0);
@@ -688,33 +649,18 @@ parameters argc and argv:
 	profile.num_rotbonds = myligand_init->num_of_rotbonds;
 
 	/*
-	printf("dockpars.num_of_intraE_contributors:%u\n", dockpars.num_of_intraE_contributors);
-	printf("dockpars.rotbondlist_length:%u\n", dockpars.rotbondlist_length);
+	para_printf("dockpars.num_of_intraE_contributors:%u\n", dockpars.num_of_intraE_contributors);
+	para_printf("dockpars.rotbondlist_length:%u\n", dockpars.rotbondlist_length);
 	*/
 
 	clock_start_docking = clock();
 
 #ifdef DOCK_DEBUG
-	printf("\n");
+	para_printf("\n");
 	// Main while-loop iterarion counter
 	unsigned int ite_cnt = 0;
 #endif
 
-	/*
-	// Added for printing intracontributor_pairs (autodockdevpy)
-	for (unsigned int intrapair_cnt=0; 
-			  intrapair_cnt<dockpars.num_of_intraE_contributors;
-			  intrapair_cnt++) {
-		if (intrapair_cnt == 0) {
-			printf("%-10s %-10s %-10s\n", "#pair", "#atom1", "#atom2");
-		}
-
-		printf ("%-10u %-10u %-10u\n", intrapair_cnt,
-					    KerConst.intraE_contributors_const[3*intrapair_cnt],
-					    KerConst.intraE_contributors_const[3*intrapair_cnt+1]);
-	}
-	*/
-
 	// Kernel1
 	setKernelArg(tData.kernel1,0, sizeof(dockpars.num_of_atoms),                  &dockpars.num_of_atoms);
 	setKernelArg(tData.kernel1,1, sizeof(dockpars.true_ligand_atoms),             &dockpars.true_ligand_atoms);
@@ -746,7 +692,7 @@ parameters argc and argv:
 	kernel1_gxsize = blocksPerGridForEachEntity * threadsPerBlock;
 	kernel1_lxsize = threadsPerBlock;
 #ifdef DOCK_DEBUG
-	printf("%-25s %10s %8u %10s %4u\n", "K_INIT", "gSize: ", kernel1_gxsize, "lSize: ", kernel1_lxsize); fflush(stdout);
+	para_printf("%-25s %10s %8u %10s %4u\n", "K_INIT", "gSize: ", kernel1_gxsize, "lSize: ", kernel1_lxsize); fflush(stdout);
 #endif
 	// End of Kernel1
 
@@ -757,7 +703,7 @@ parameters argc and argv:
 	kernel2_gxsize = blocksPerGridForEachRun * threadsPerBlock;
 	kernel2_lxsize = threadsPerBlock;
 #ifdef DOCK_DEBUG
-	printf("%-25s %10s %8u %10s %4u\n", "K_EVAL", "gSize: ", kernel2_gxsize, "lSize: ",  kernel2_lxsize); fflush(stdout);
+	para_printf("%-25s %10s %8u %10s %4u\n", "K_EVAL", "gSize: ", kernel2_gxsize, "lSize: ",  kernel2_lxsize); fflush(stdout);
 #endif
 	// End of Kernel2
 
@@ -802,7 +748,7 @@ parameters argc and argv:
 	kernel4_gxsize = blocksPerGridForEachEntity * threadsPerBlock;
 	kernel4_lxsize = threadsPerBlock;
 #ifdef DOCK_DEBUG
-	printf("%-25s %10s %8u %10s %4u\n", "K_GA_GENERATION", "gSize: ",  kernel4_gxsize, "lSize: ", kernel4_lxsize); fflush(stdout);
+	para_printf("%-25s %10s %8u %10s %4u\n", "K_GA_GENERATION", "gSize: ",  kernel4_gxsize, "lSize: ", kernel4_lxsize); fflush(stdout);
 #endif
 	// End of Kernel4
 
@@ -851,7 +797,7 @@ parameters argc and argv:
 			kernel3_gxsize = blocksPerGridForEachLSEntity * threadsPerBlock;
 			kernel3_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_SOLISWETS", "gSize: ", kernel3_gxsize, "lSize: ", kernel3_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_SOLISWETS", "gSize: ", kernel3_gxsize, "lSize: ", kernel3_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel3
 		}
@@ -901,7 +847,7 @@ parameters argc and argv:
 			kernel5_gxsize = blocksPerGridForEachGradMinimizerEntity * threadsPerBlock;
 			kernel5_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_SDESCENT", "gSize: ", kernel5_gxsize, "lSize: ", kernel5_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_SDESCENT", "gSize: ", kernel5_gxsize, "lSize: ", kernel5_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel5
 		}
@@ -949,7 +895,7 @@ parameters argc and argv:
 			kernel6_gxsize = blocksPerGridForEachGradMinimizerEntity * threadsPerBlock;
 			kernel6_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_FIRE", "gSize: ", kernel6_gxsize, "lSize: ", kernel6_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_FIRE", "gSize: ", kernel6_gxsize, "lSize: ", kernel6_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel6
 		}
@@ -998,7 +944,7 @@ parameters argc and argv:
 			kernel7_gxsize = blocksPerGridForEachGradMinimizerEntity * threadsPerBlock;
 			kernel7_lxsize = threadsPerBlock;
 			#ifdef DOCK_DEBUG
-			printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_ADADELTA", "gSize: ", kernel7_gxsize, "lSize: ", kernel7_lxsize); fflush(stdout);
+			para_printf("%-25s %10s %8u %10s %4u\n", "K_LS_GRAD_ADADELTA", "gSize: ", kernel7_gxsize, "lSize: ", kernel7_lxsize); fflush(stdout);
 			#endif
 			// End of Kernel7
 		}
@@ -1006,22 +952,22 @@ parameters argc and argv:
 
 	// Kernel1
 	#ifdef DOCK_DEBUG
-		printf("\nExecution starts:\n\n");
-		printf("%-25s", "\tK_INIT");fflush(stdout);
+		para_printf("\nExecution starts:\n\n");
+		para_printf("%-25s", "\tK_INIT");fflush(stdout);
 	#endif
 	runKernel1D(tData.command_queue,tData.kernel1,kernel1_gxsize,kernel1_lxsize,&time_start_kernel,&time_end_kernel);
 	#ifdef DOCK_DEBUG
-		printf("%15s" ," ... Finished\n");fflush(stdout);
+		para_printf("%15s" ," ... Finished\n");fflush(stdout);
 	#endif
 	// End of Kernel1
 
 	// Kernel2
 	#ifdef DOCK_DEBUG
-		printf("%-25s", "\tK_EVAL");fflush(stdout);
+		para_printf("%-25s", "\tK_EVAL");fflush(stdout);
 	#endif
 	runKernel1D(tData.command_queue,tData.kernel2,kernel2_gxsize,kernel2_lxsize,&time_start_kernel,&time_end_kernel);
 	#ifdef DOCK_DEBUG
-		printf("%15s" ," ... Finished\n");fflush(stdout);
+		para_printf("%15s" ," ... Finished\n");fflush(stdout);
 	#endif
 	// End of Kernel2
 	// ===============================================================================
@@ -1040,7 +986,7 @@ parameters argc and argv:
 	generation_cnt = 0;
 	unsigned long total_evals;
 	// print progress bar
-	AutoStop autostop(mypars->pop_size, mypars->num_of_runs, mypars->stopstd, mypars->as_frequency);
+	AutoStop autostop(mypars->pop_size, mypars->num_of_runs, mypars->stopstd, mypars->as_frequency, output);
 #ifndef DOCK_DEBUG
 	if (mypars->autostop)
 	{
@@ -1048,9 +994,9 @@ parameters argc and argv:
 	}
 	else
 	{
-		printf("\nExecuting docking runs:\n");
-		printf("        20%%        40%%       60%%       80%%       100%%\n");
-		printf("---------+---------+---------+---------+---------+\n");
+		para_printf("\nExecuting docking runs:\n");
+		para_printf("        20%%        40%%       60%%       80%%       100%%\n");
+		para_printf("---------+---------+---------+---------+---------+\n");
 	}
 #endif
 	curr_progress_cnt = 0;
@@ -1079,7 +1025,7 @@ parameters argc and argv:
 		{
 #ifdef DOCK_DEBUG
 			ite_cnt++;
-			printf("\nLGA iteration # %u\n", ite_cnt);
+			para_printf("\nLGA iteration # %u\n", ite_cnt);
 			fflush(stdout);
 #endif
 			// update progress bar (bar length is 50)
@@ -1089,59 +1035,59 @@ parameters argc and argv:
 			while (curr_progress_cnt < new_progress_cnt) {
 				curr_progress_cnt++;
 #ifndef DOCK_DEBUG
-				printf("*");
+				para_printf("*");
 #endif
 				fflush(stdout);
 			}
 		}
 		// Kernel4
 		#ifdef DOCK_DEBUG
-			printf("%-25s", "\tK_GA_GENERATION");fflush(stdout);
+			para_printf("%-25s", "\tK_GA_GENERATION");fflush(stdout);
 		#endif
 		runKernel1D(tData.command_queue,tData.kernel4,kernel4_gxsize,kernel4_lxsize,&time_start_kernel,&time_end_kernel);
 		#ifdef DOCK_DEBUG
-			printf("%15s", " ... Finished\n");fflush(stdout);
+			para_printf("%15s", " ... Finished\n");fflush(stdout);
 		#endif
 		// End of Kernel4
 		if (dockpars.lsearch_rate != 0.0f) {
 			if ((strcmp(mypars->ls_method, "sw") == 0) || ((strcmp(mypars->ls_method, "ad") == 0) && (generation_cnt<mypars->initial_sw_generations))) {
 				// Kernel3
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_SOLISWETS");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_SOLISWETS");fflush(stdout);
 				#endif
 				runKernel1D(tData.command_queue,tData.kernel3,kernel3_gxsize,kernel3_lxsize,&time_start_kernel,&time_end_kernel);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel3
 			} else if (strcmp(mypars->ls_method, "sd") == 0) {
 				// Kernel5
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_GRAD_SDESCENT");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_GRAD_SDESCENT");fflush(stdout);
 				#endif
 				runKernel1D(tData.command_queue,tData.kernel5,kernel5_gxsize,kernel5_lxsize,&time_start_kernel,&time_end_kernel);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel5
 			} else if (strcmp(mypars->ls_method, "fire") == 0) {
 				// Kernel6
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_GRAD_FIRE");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_GRAD_FIRE");fflush(stdout);
 				#endif
 				runKernel1D(tData.command_queue,tData.kernel6,kernel6_gxsize,kernel6_lxsize,&time_start_kernel,&time_end_kernel);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel6
 			} else if (strcmp(mypars->ls_method, "ad") == 0) {
 				// Kernel7
 				#ifdef DOCK_DEBUG
-					printf("%-25s", "\tK_LS_GRAD_ADADELTA");fflush(stdout);
+					para_printf("%-25s", "\tK_LS_GRAD_ADADELTA");fflush(stdout);
 				#endif
 				runKernel1D(tData.command_queue,tData.kernel7,kernel7_gxsize,kernel7_lxsize,&time_start_kernel,&time_end_kernel);
 				#ifdef DOCK_DEBUG
-					printf("%15s" ," ... Finished\n");fflush(stdout);
+					para_printf("%15s" ," ... Finished\n");fflush(stdout);
 				#endif
 				// End of Kernel7
 			}
@@ -1153,11 +1099,11 @@ parameters argc and argv:
 		// -------- Replacing with memory maps! ------------
 		// Kernel2
 		#ifdef DOCK_DEBUG
-			printf("%-25s", "\tK_EVAL");fflush(stdout);
+			para_printf("%-25s", "\tK_EVAL");fflush(stdout);
 		#endif
 		runKernel1D(tData.command_queue,tData.kernel2,kernel2_gxsize,kernel2_lxsize,&time_start_kernel,&time_end_kernel);
 		#ifdef DOCK_DEBUG
-			printf("%15s" ," ... Finished\n");fflush(stdout);
+			para_printf("%15s" ," ... Finished\n");fflush(stdout);
 		#endif
 		// End of Kernel2
 		// ===============================================================================
@@ -1236,7 +1182,7 @@ parameters argc and argv:
 		}
 		// ----------------------------------------------------------------------
 		#ifdef DOCK_DEBUG
-			printf("\tProgress %.3f %%\n", progress);
+			para_printf("\tProgress %.3f %%\n", progress);
 			fflush(stdout);
 		#endif
 	} // End of while-loop
@@ -1251,7 +1197,7 @@ parameters argc and argv:
 		//update progress bar (bar length is 50)mem_num_of_rotatingatoms_per_rotbond_const
 		while (curr_progress_cnt < 50) {
 			curr_progress_cnt++;
-			printf("*");
+			para_printf("*");
 			fflush(stdout);
 		}
 	}
@@ -1272,12 +1218,12 @@ parameters argc and argv:
 	// Final autostop statistics output
 	if (mypars->autostop) autostop.output_final_stddev(generation_cnt, sim_state.cpu_energies.data(), total_evals);
 
-	printf("\n");
+	para_printf("\n");
 #if defined (DOCK_DEBUG)
 	for (int cnt_pop=0;cnt_pop<size_populations/sizeof(float);cnt_pop++)
-		printf("total_num_pop: %u, cpu_final_populations[%u]: %f\n",(unsigned int)(size_populations/sizeof(float)),cnt_pop,cpu_final_populations[cnt_pop]);
+		para_printf("total_num_pop: %u, cpu_final_populations[%u]: %f\n",(unsigned int)(size_populations/sizeof(float)),cnt_pop,cpu_final_populations[cnt_pop]);
 	for (int cnt_pop=0;cnt_pop<size_energies/sizeof(float);cnt_pop++)
-		printf("total_num_energies: %u, sim_state.cpu_energies.data()[%u]: %f\n",    (unsigned int)(size_energies/sizeof(float)),cnt_pop,sim_state.cpu_energies.data()[cnt_pop]);
+		para_printf("total_num_energies: %u, sim_state.cpu_energies.data()[%u]: %f\n",    (unsigned int)(size_energies/sizeof(float)),cnt_pop,sim_state.cpu_energies.data()[cnt_pop]);
 #endif
 	// ===============================================================================
 	// Assign simulation results to sim_state
@@ -1286,6 +1232,11 @@ parameters argc and argv:
 	sim_state.sec_per_run = ELAPSEDSECS(clock_stop_docking, clock_start_docking)/mypars->num_of_runs;
 	sim_state.total_evals = total_evals;
 
+#if defined (MAPPED_COPY)
+	unmemMap(tData.command_queue,mem_gpu_evals_of_runs,map_cpu_evals_of_runs);
+#endif
+	clFinish(tData.command_queue);
+
 	clReleaseMemObject(mem_dockpars_conformations_current);
 	clReleaseMemObject(mem_dockpars_energies_current);
 	clReleaseMemObject(mem_dockpars_conformations_next);
@@ -1294,7 +1245,15 @@ parameters argc and argv:
 	clReleaseMemObject(mem_dockpars_prng_states);
 	clReleaseMemObject(mem_gpu_evals_of_runs);
 
+	delete KerConst_interintra;
+	delete KerConst_intracontrib;
+	delete KerConst_intra;
+	delete KerConst_rotlist;
+	delete KerConst_conform;
+	delete KerConst_grads;
+
 	free(cpu_prng_seeds);
+	if(output!=NULL) free(outbuf);
 
 	return 0;
 }
diff --git a/host/src/processgrid.cpp b/host/src/processgrid.cpp
index f1a13411..e29fb712 100644
--- a/host/src/processgrid.cpp
+++ b/host/src/processgrid.cpp
@@ -30,68 +30,57 @@ int get_gridinfo(
                        Gridinfo* mygrid
                 )
 {
-	if(mygrid->info_read) return 0; // already succesfully read this grid's information
+	if(strcmp(fldfilename,mygrid->fld_name.c_str())==0)
+		return 0; // already successfully read this grid's information
 
-	FILE*  fp;
+	if(mygrid->fld_name.size()) // clear grid mapping if information from a previous fld file exists
+		mygrid->grid_mapping.clear();
+
+	std::ifstream fp;
+	std::string line;
 	char   tempstr [256];
 	int    gpoints_even[3];
 	int    recnamelen;
 	double center[3];
+	int grid_types=0;
 
 	// ----------------------------------------------------
 	// Getting full path fo the grid file
 	// Getting father directory name
-	//char* dir = dirname(ts1);
-	//char* filename = basename(ts1);
-
-	char* ts1 = strdup(fldfilename);
-	#ifndef _WIN32
-	mygrid->grid_file_path = strdup(dirname(ts1));
-	#else
-	char drive_tmp[_MAX_DRIVE];
-	char path_tmp[_MAX_DIR];
-	_splitpath(ts1, drive_tmp, path_tmp, NULL, NULL);
-	
-	char result[_MAX_DRIVE+_MAX_DIR];
-	strcpy(result, drive_tmp);
-	strcat(result, path_tmp);
-	mygrid->grid_file_path = strdup(result);
-	#endif
-	free(ts1); // clean up
+	mygrid->grid_file_path = get_filepath(fldfilename);
 	// ----------------------------------------------------
 
 	// Processing fld file
-	fp = fopen(fldfilename, "rb"); // fp = fopen(fldfilename, "r");
-	if (fp == NULL)
+	fp.open(fldfilename);
+	if (fp.fail())
 	{
-		printf("Error: can't open fld file %s!\n", fldfilename);
+		printf("Error: Can't open fld file %s.\n", fldfilename);
 		return 1;
 	}
 
 	const char* ext = strstr(fldfilename,".maps");
 	if(ext){
 		int len=ext-fldfilename;
-		mygrid->map_base_name = (char*)malloc((len+1)*sizeof(char));
-		strncpy(mygrid->map_base_name,fldfilename,len);
-		mygrid->map_base_name[len]='\0';
+		mygrid->map_base_name.assign(fldfilename,len);
 	} else{
-		int len=strlen(fldfilename)+1;
-		mygrid->map_base_name = (char*)malloc(len*sizeof(char));
-		strcpy(mygrid->map_base_name,fldfilename);
+		mygrid->map_base_name = fldfilename;
 	}
-
-	while (fscanf(fp, "%255s", tempstr) != EOF)
+	
+	bool have_e=false;
+	bool have_d=false;
+	while(std::getline(fp, line))
 	{
+		if(sscanf(line.c_str(),"%255s",tempstr)!=1) continue; // blank line: no keyword to match; skipping avoids reusing a stale tempstr and indexing past the end of line below
 		// -----------------------------------
 		// Reorder according to file *.maps.fld
 		// -----------------------------------
 		// Grid spacing
 		if (strcmp(tempstr, "#SPACING") == 0)
 		{
-			fscanf(fp, "%lf", &(mygrid->spacing));
+			sscanf(&line.c_str()[8], "%lf", &(mygrid->spacing));
 			if (mygrid->spacing > 1)
 			{
-				printf("Error: grid spacing is too big!\n");
+				printf("Error: Grid spacing is larger than 1 Å.\n");
 				return 1;
 			}
 		}
@@ -99,7 +88,7 @@ int get_gridinfo(
 		// capturing number of grid points
 		if (strcmp(tempstr, "#NELEMENTS") == 0)
 		{
-			fscanf(fp, "%d%d%d", &(gpoints_even[0]), &(gpoints_even[1]), &(gpoints_even[2]));
+			sscanf(&line.c_str()[10], "%d %d %d", &(gpoints_even[0]), &(gpoints_even[1]), &(gpoints_even[2]));
 			// plus one gridpoint in each dimension
 			mygrid->size_xyz[0] = gpoints_even[0] + 1;
 			mygrid->size_xyz[1] = gpoints_even[1] + 1;
@@ -108,7 +97,7 @@ int get_gridinfo(
 			// If the grid is too big, send message and change the value of truncated_size_xyz
 			if ((mygrid->size_xyz [0] > MAX_NUM_GRIDPOINTS) || (mygrid->size_xyz [1] > MAX_NUM_GRIDPOINTS) || (mygrid->size_xyz [2] > MAX_NUM_GRIDPOINTS))
 			{
-				printf("Error: each dimension of the grid must be below %i.\n", MAX_NUM_GRIDPOINTS);
+				printf("Error: Each dimension of the grid must be below %i.\n", MAX_NUM_GRIDPOINTS);
 				return 1;
 			}
 		}
@@ -116,139 +105,138 @@ int get_gridinfo(
 		// Capturing center
 		if (strcmp(tempstr, "#CENTER") == 0)
 		{
-			fscanf(fp, "%lf%lf%lf", &(center[0]), &(center[1]), &(center[2]));
+			sscanf(&line.c_str()[7], "%lf %lf %lf", &(center[0]), &(center[1]), &(center[2]));
 		}
 
 		// Name of the receptor and corresponding files
 		if (strcmp(tempstr, "#MACROMOLECULE") == 0)
 		{
-			fscanf(fp, "%255s", tempstr);
+			sscanf(&line.c_str()[14], "%255s", tempstr);
 			recnamelen = strcspn(tempstr,".");
 			tempstr[recnamelen] = '\0';
 			int len = strlen(tempstr)+1;
-			mygrid->receptor_name = (char*)malloc(len*sizeof(char));
-			strcpy(mygrid->receptor_name, tempstr);
+			mygrid->receptor_name = tempstr;
 		}
 
-		// -----------------------------------
-		// MISSING: similar section corresponding to
-		// #GRID_PARAMETER_FILE
-		// -----------------------------------
+		if (line.find("label=") == 0)
+		{
+			sscanf(&line.c_str()[6],"%255s", tempstr);
+			char* typesep = strchr(tempstr,'-'); // <atom type>-affinity ...
+			if(typesep!=NULL){
+				typesep[0]='\0'; // tempstr is now just the atom type
+			} else{
+				tempstr[1]='\0'; // tempstr is now either E(lectrostatics) or D(esolvation)
+				tempstr[0]=tolower(tempstr[0]); // lower-case it
+			}
+			if(tempstr[0]=='e') have_e=true;
+			if(tempstr[0]=='d') have_d=true;
+			mygrid->grid_mapping.push_back(tempstr);
+			grid_types++;
+		}
+		
+		if (strcmp(tempstr, "variable") == 0)
+		{
+			size_t fidx = line.find("file=");
+			if(fidx==std::string::npos){
+				printf("Error: Grid map file names cannot be read.\n");
+				return 1;
+			}
+			sscanf(&line.c_str()[fidx+5],"%255s", tempstr);
+			mygrid->grid_mapping.push_back(tempstr);
+		}
+	}
+	
+	if(mygrid->grid_mapping.size() != 2*grid_types){ // grid_mapping holds one entry per label= line plus one per file= entry, so both counts must match
+		printf("Error: Number of grid map labels (%d) and filenames (%d) mismatched in fld file.\n", grid_types, (int)mygrid->grid_mapping.size()-grid_types); // cast: size() is size_t but %d expects int
+		return 1;
 	}
+	if(!have_e){
+		printf("Error: Grid map does not contain an (e)lectrostatics map.\n");
+		return 1;
+	}
+	if(!have_d){
+		printf("Error: Grid map does not contain a (d)esolvation map.\n");
+		return 1;
+	}
+	mygrid->num_of_map_atypes = grid_types-2; // w/o e and d maps
 
 	// calculating grid size
 	mygrid->size_xyz_angstr[0] = (mygrid->size_xyz[0]-1)*(mygrid->spacing);
 	mygrid->size_xyz_angstr[1] = (mygrid->size_xyz[1]-1)*(mygrid->spacing);
 	mygrid->size_xyz_angstr[2] = (mygrid->size_xyz[2]-1)*(mygrid->spacing);
 
+	if((center[0] + 0.5f * mygrid->size_xyz_angstr[0] > 9999.0f) || (center[0] - 0.5f * mygrid->size_xyz_angstr[0] < -999.0f) ||
+	   (center[1] + 0.5f * mygrid->size_xyz_angstr[1] > 9999.0f) || (center[1] - 0.5f * mygrid->size_xyz_angstr[1] < -999.0f) ||
+	   (center[2] + 0.5f * mygrid->size_xyz_angstr[2] > 9999.0f) || (center[2] - 0.5f * mygrid->size_xyz_angstr[2] < -999.0f)){
+		printf("Error: Grid box needs to be within [-999,9999] Å for each dimension to ensure result ligand coordinates are compatible with the pdbqt format.\n");
+		return 1;
+	}
+
 	// calculating coordinates of origo
 	mygrid->origo_real_xyz[0] = center[0] - (((double) gpoints_even[0])*0.5*(mygrid->spacing));
 	mygrid->origo_real_xyz[1] = center[1] - (((double) gpoints_even[1])*0.5*(mygrid->spacing));
 	mygrid->origo_real_xyz[2] = center[2] - (((double) gpoints_even[2])*0.5*(mygrid->spacing));
 
-	fclose(fp);
-	mygrid->info_read = true;
+	fp.close();
+	mygrid->fld_name = fldfilename;
 
 	return 0;
 }
 
-int get_gridvalues_f(
-                     const Gridinfo* mygrid,
-                           float**   fgrids,
-                           bool      cgmaps
-                    )
-{
-	*fgrids = (float*) malloc(4*(sizeof(float))*(mygrid->num_of_atypes+2)*
-	                                            (mygrid->size_xyz[0])*
-	                                            (mygrid->size_xyz[1])*
-	                                            (mygrid->size_xyz[2]));
-	if (*fgrids == NULL)
-	{
-		printf("Error: not enough memory!\n");
-		return 1;
-	}
-	return get_gridvalues_f(mygrid, *fgrids, cgmaps);
-}
-
-int get_gridvalues_f(
-                     const Gridinfo* mygrid,
-                           float*    fgrids,
-                           bool      cgmaps
-                    )
+int get_gridvalues(Gridinfo* mygrid)
 // The function reads the grid point values from the .map files
 // that correspond to the receptor given by the first parameter.
-// It allocates the proper amount of memory and stores the data there,
-// which can be accessed with the fgrids pointer.
+// It allocates the proper amount of memory and stores the data
+// in mygrid->grids. A map that cannot be opened, lacks its CENTER line, or ends early is an error.
 // If there are any errors, it returns 1, otherwise
 // the return value is 0.
 {
-	int t, x, y, z;
-	FILE* fp;
-	size_t len = strlen(mygrid->grid_file_path)+strlen(mygrid->receptor_name)+1;
-	if(strlen(mygrid->map_base_name)>len)
-		len = strlen(mygrid->map_base_name);
-	len += 10; // "..map\0" = 6 entries + 4 at most for grid type
-	if(len<128) len=128;
-	char* tempstr = (char*)malloc(len*sizeof(char));
-	float* mypoi;
-
-	mypoi = fgrids;
-
-	for (t=0; t < mygrid->num_of_atypes+2; t++)
+	if(mygrid->grids.size()>0) return 0; // we already read the grid maps
+	mygrid->grids.resize(2*mygrid->grid_mapping.size()*
+	                      (mygrid->size_xyz[0])*
+	                      (mygrid->size_xyz[1])*
+	                      (mygrid->size_xyz[2]));
+	int t, ti, x, y, z;
+	std::ifstream fp;
+	std::string fn, line;
+	float* mypoi = mygrid->grids.data();
+
+	unsigned int g1 = mygrid->size_xyz[0];
+	unsigned int g2 = g1*mygrid->size_xyz[1];
+
+	for (t=0; t < mygrid->grid_mapping.size()/2; t++)
 	{
-		// opening corresponding .map file
-		strcpy(tempstr,mygrid->map_base_name);
-		strcat(tempstr, ".");
-		strcat(tempstr, mygrid->grid_types[t]);
-		strcat(tempstr, ".map");
-		fp = fopen(tempstr, "rb"); // fp = fopen(tempstr, "r");
-		if (fp == NULL){ // try again with the receptor name in the .maps.fld file
-			strcpy(tempstr,mygrid->grid_file_path);
-			strcat(tempstr, "/");
-			strcat(tempstr, mygrid->receptor_name);
-			strcat(tempstr, ".");
-			strcat(tempstr, mygrid->grid_types[t]);
-			strcat(tempstr, ".map");
-			fp = fopen(tempstr, "rb"); // fp = fopen(tempstr, "r");
+		ti = t + mygrid->grid_mapping.size()/2; // file names are stored after the labels in grid_mapping
+		if(mygrid->fld_relative){ // this is always true (unless changed)
+			fn=mygrid->grid_file_path+"/"+mygrid->grid_mapping[ti];
+//			printf("Atom type %d (%s) uses map: %s\n",t,mygrid->grid_mapping[t].c_str(),fn.c_str());
+			fp.open(fn);
 		}
-		if (fp == NULL)
+		if (!fp.is_open()) // also true when no open was attempted above (fail() would not notice that)
 		{
-			printf("Error: can't open %s!\n", tempstr);
-			if ((strncmp(mygrid->grid_types[t],"CG",2)==0) ||
-			    (strncmp(mygrid->grid_types[t],"G",1)==0))
-			{
-				if(cgmaps)
-					printf("-> Expecting an individual map for each CGx and Gx (x=0..9) atom type.\n");
-				else
-					printf("-> Expecting one map file, ending in .CG.map and .G0.map, for CGx and Gx atom types, respectively.\n");
-			}
+			printf("Error: Can't open grid map %s.\n", fn.c_str());
 			return 1;
 		}
 
 		// seeking to first data
-		do    fscanf(fp, "%127s", tempstr);
-		while (strcmp(tempstr, "CENTER") != 0);
-		fscanf(fp, "%127s", tempstr);
-		fscanf(fp, "%127s", tempstr);
-		fscanf(fp, "%127s", tempstr);
-
-		unsigned int g1 = mygrid->size_xyz[0];
-		unsigned int g2 = g1*mygrid->size_xyz[1];
+		while (std::getline(fp, line) && (line.find("CENTER") != 0)); // stop at the CENTER line or when the file ends
+		if (!fp) { printf("Error: Grid map %s does not contain a CENTER line.\n", fn.c_str()); return 1; } // previously looped forever here
+
 		// reading values
 		for (z=0; z < mygrid->size_xyz[2]; z++)
 			for (y=0; y < mygrid->size_xyz[1]; y++)
 				for (x=0; x < mygrid->size_xyz[0]; x++)
 				{
-					fscanf(fp, "%f", mypoi);
+					if(!std::getline(fp, line)){ printf("Error: Grid map %s contains fewer values than expected.\n", fn.c_str()); return 1; } // one value per line
+					*mypoi = map2float(line.c_str());
 					// fill in duplicate data for linearized memory access in kernel
 					if(y>0) *(mypoi-4*g1+1) = *mypoi;
 					if(z>0) *(mypoi-4*g2+2) = *mypoi;
 					if(y>0 && z>0) *(mypoi-4*(g2+g1)+3) = *mypoi;
 					mypoi+=4;
 				}
-		fclose(fp);
+		fp.close();
 	}
-	free(tempstr);
 	return 0;
 }
 
diff --git a/host/src/processligand.cpp b/host/src/processligand.cpp
index 43c5a779..9abf5667 100644
--- a/host/src/processligand.cpp
+++ b/host/src/processligand.cpp
@@ -43,16 +43,17 @@ int init_liganddata(
                           Liganddata*  myligand,
                           Gridinfo*    mygrid,
                           int          nr_deriv_atypes,
-                          deriv_atype* deriv_atypes,
-                          bool         cgmaps
+                          deriv_atype* deriv_atypes
                    )
 // The functions first parameter is an empty Liganddata, the second a variable of
 // Gridinfo type. The function fills the num_of_atypes and atom_types fields of
-// myligand according to the num_of_atypes and grid_types fields of mygrid. In
+// myligand according to the num_of_atypes and ligand_grid_types fields of mygrid. In
 // this case it is supposed, that the ligand and receptor described by the two
 // parameters correspond to each other.
 // If the operation was successful, the function returns 0, if not, it returns 1.
 {
+	myligand->file_content.clear();
+	myligand->ligand_line_count = 0;
 	std::ifstream fp;
 	int num_of_atypes, new_type, num_of_base_atypes;
 	char atom_types [MAX_NUM_OF_ATOMS][4];
@@ -76,23 +77,23 @@ int init_liganddata(
 			fp.open(ligfilename);
 		else
 			fp.open(flexresfilename);
+		
 		if (fp.fail())
 		{
 			if(l==0)
-				printf("Error: can't open ligand data file %s!\n", ligfilename);
+				printf("Error: Can't open ligand data file %s.\n", ligfilename);
 			else
-				printf("Error: can't open flexibe residue data file %s!\n", flexresfilename);
+				printf("Error: Can't open flexible residue data file %s.\n", flexresfilename);
 			return 1;
 		}
 		// reading the whole ligand pdbqt file
 		while(std::getline(fp, line))
 		{
+			myligand->file_content.push_back(line+'\n'); // also stores flexres
 			sscanf(line.c_str(),"%255s",tempstr);
 			if ((strcmp(tempstr, "HETATM") == 0) || (strcmp(tempstr, "ATOM") == 0))
 			{
 				new_type = 1; // supposing this will be a new atom type
-				if ((strcmp(tempstr, "HETATM") == 0)) // seeking to the first coordinate value
-				line[17]='\0';
 				sscanf(&line.c_str()[77], "%3s", tempstr); // reading atom type
 				tempstr[3] = '\0'; //just to be sure strcpy wont fail even if something is wrong with position
 				line[17]='\0';
@@ -111,7 +112,7 @@ int init_liganddata(
 					// checking if atom type number doesn't exceed 14
 					if (num_of_atypes >= MAX_NUM_OF_ATYPES)
 					{
-						printf("Error: too many types of ligand atoms!\n");
+						printf("Error: Too many ligand atom types (more than %d).\n",MAX_NUM_OF_ATYPES);
 						return 1;
 					}
 
@@ -147,21 +148,10 @@ int init_liganddata(
 			}
 		}
 		// copying field to ligand and grid data
+		if(l==0) myligand->ligand_line_count = myligand->file_content.size();
 		myligand->num_of_atypes = num_of_atypes;
-		mygrid->num_of_atypes   = num_of_base_atypes;
-		mygrid->num_of_map_atypes   = num_of_base_atypes;
 		fp.close();
 	}
-#if defined(CG_G0_INFO)
-	if (cgmaps)
-	{
-		printf("Expecting individual maps for CGx and Gx atom types (x=0..9).\n");
-	}
-	else
-	{
-		printf("Using one map file, .CG.map and .G0.map, for CGx and Gx atom types, respectively.\n");
-	}
-#endif
 #ifdef TYPE_INFO
 	printf("Ligand contains %i base types and %i derived types.\n",num_of_base_atypes,num_of_atypes-num_of_base_atypes);
 #endif
@@ -169,27 +159,35 @@ int init_liganddata(
 	{
 		strcpy(myligand->atom_types[i], atom_types[i]);
 		strcpy(myligand->base_atom_types[i], base_atom_types[i]);
-		strcpy(mygrid->grid_types[myligand->base_type_idx[i]], base_atom_types[i]);
-		if(strncmp(base_atom_types[i],"CG",2)+strncmp(base_atom_types[i],"G",1)==0){
-			memcpy(mygrid->grid_types[myligand->base_type_idx[i]], base_atom_types[i],2*sizeof(char));
-			mygrid->grid_types[myligand->base_type_idx[i]][2] = '\0'; // make sure CG0..9 results in CG
-			if (isdigit(mygrid->grid_types[myligand->base_type_idx[i]][1])) // make sure G0..9 results in G0
-				mygrid->grid_types[myligand->base_type_idx[i]][1] = '0';
+		strcpy(myligand->ligand_grid_types[myligand->base_type_idx[i]], base_atom_types[i]);
+		if((strncmp(atom_types[i],"CG",2)==0) || (strncmp(atom_types[i],"G",1)==0)){ // CGx and Gx can have derived types but purely to determine which map to use (so only ligand_grid_types should be set to base_type)
+			memcpy(myligand->ligand_grid_types[myligand->base_type_idx[i]], base_atom_types[i],2*sizeof(char)); // only base type name is copied
+			if(strcmp(atom_types[i],base_atom_types[i])!=0){ // derived CGx/Gx type exists
+				strcpy(myligand->base_atom_types[i], atom_types[i]);
+			} else{
+				if(strncmp(atom_types[i],"CG",2)==0){ // if no derived type CG type exists, use C map by default
+					myligand->ligand_grid_types[myligand->base_type_idx[i]][1] = '\0'; // CG was already in there
+				}
+			}
+			myligand->ligand_grid_types[myligand->base_type_idx[i]][2] = '\0'; // make sure CG0..9 results in CG
+			if (isdigit(myligand->ligand_grid_types[myligand->base_type_idx[i]][1])) // make sure G0..9 results in G0
+				myligand->ligand_grid_types[myligand->base_type_idx[i]][1] = '0';
 		}
 #ifdef TYPE_INFO
-		printf("Atom type %i -> %s -> %s (grid type %i)\n",i,myligand->atom_types[i],mygrid->grid_types[myligand->base_type_idx[i]],myligand->base_type_idx[i]);
+		printf("Atom type %i -> %s -> %s (grid type %i)\n",i,myligand->atom_types[i],myligand->ligand_grid_types[myligand->base_type_idx[i]],myligand->base_type_idx[i]);
 #endif
 	}
 
 	// adding the two other grid types to mygrid
-	strcpy(mygrid->grid_types[num_of_base_atypes],   "e");
-	strcpy(mygrid->grid_types[num_of_base_atypes+1], "d");
+	strcpy(myligand->ligand_grid_types[num_of_base_atypes],   "e");
+	strcpy(myligand->ligand_grid_types[num_of_base_atypes+1], "d");
 
 	return 0;
 }
 
 int set_liganddata_typeid(
                                 Liganddata* myligand,
+                                Gridinfo*   mygrid,
                                 int         atom_id,
                           const char*       typeof_new_atom
                          )
@@ -213,12 +211,27 @@ int set_liganddata_typeid(
 	if (type < myligand->num_of_atypes)
 	{
 		myligand->atom_idxyzq[atom_id][0] = type;
-		myligand->atom_map_to_fgrids[atom_id] = myligand->base_type_idx[type];
+		myligand->atom_map_to_fgrids[atom_id] = -1;
+		for (i=0; i<(mygrid->grid_mapping.size()/2-2); i++){
+			if(strcmp(mygrid->grid_mapping[i].c_str(),myligand->ligand_grid_types[myligand->base_type_idx[type]]) == 0){
+				myligand->atom_map_to_fgrids[atom_id]=i; // found
+				break;
+			}
+		}
+		if(myligand->atom_map_to_fgrids[atom_id]<0){
+			// raise map error unless base G-map is specified (which ignores map by default)
+			if(strncmp(myligand->ligand_grid_types[myligand->base_type_idx[type]],"G",1)!=0){
+				printf("Error: No map file specified for atom type %s in fld and no derived type (--derivtype, -T) either.\n",myligand->ligand_grid_types[myligand->base_type_idx[type]]);
+				if (strncmp(myligand->ligand_grid_types[myligand->base_type_idx[type]],"CG",2)==0)
+					printf("       Expecting a derived type for each CGx (x=0..9) atom type (i.e. --derivtype CG0,CG1=C).\n");
+				return 1;
+			}
+		}
 		return 0;
 	}
 	else // if typeof_new_atom hasn't been found
 	{
-		printf("Error: no grid for ligand atom type %s!\n", typeof_new_atom);
+		printf("Error: No grid map for ligand atom type %s.\n", typeof_new_atom);
 		return 1;
 	}
 }
@@ -650,7 +663,9 @@ int get_bonds(Liganddata* myligand)
 
 			if ((atom_nameid1 == ATYPE_GETBONDS) || (atom_nameid2 == ATYPE_GETBONDS))
 			{
-				printf("Error: Ligand includes atom with unknown type: %s or %s!\n", myligand->base_atom_types[atom_typeid1], myligand->base_atom_types[atom_typeid2]);
+				if(atom_nameid1+atom_nameid2==2*ATYPE_GETBONDS){
+					printf("Error: Ligand includes atom with unknown types: %s and %s.\n", myligand->base_atom_types[atom_typeid1], myligand->base_atom_types[atom_typeid2]);
+				} else printf("Error: Ligand includes atom with unknown type: %s.\n", (atom_nameid1==ATYPE_GETBONDS) ? myligand->base_atom_types[atom_typeid1] : myligand->base_atom_types[atom_typeid2]);
 				return 1;
 			}
 
@@ -920,13 +935,13 @@ int get_VWpars(
 
 			if (VWid_atype1 == MAX_NUM_OF_ATYPES)
 			{
-				printf("Error: Ligand includes atom with unknown type 1: %s!\n", myligand->atom_types [atom_typeid1]);
+				printf("Error: Ligand includes atom with unknown type 1: %s.\n", myligand->atom_types [atom_typeid1]);
 				return 1;
 			}
 
 			if  (VWid_atype2 == MAX_NUM_OF_ATYPES)
 			{
-				printf("Error: Ligand includes atom with unknown type 2: %s!\n", myligand->atom_types [atom_typeid2]);
+				printf("Error: Ligand includes atom with unknown type 2: %s.\n", myligand->atom_types [atom_typeid2]);
 				return 1;
 			}
 
@@ -1063,7 +1078,7 @@ int get_moving_and_unit_vectors(Liganddata* myligand)
 		dist = distance(pointA, pointB);
 
 		if (dist==0.0){
-			printf("Error: Two atoms have the same XYZ coordinates!\n");
+			printf("Error: Atoms #%d and #%d have the same XYZ coordinates.\n",atom_id_pointA+1,atom_id_pointB+1);
                 	return 1;
 		}
 
@@ -1089,25 +1104,26 @@ int get_moving_and_unit_vectors(Liganddata* myligand)
 	return 0;
 }
 
-int get_liganddata(
-                   const char*        ligfilename,
-                   const char*        flexresfilename,
-                         Liganddata*  myligand,
-                   const double       AD4_coeff_vdW,
-                   const double       AD4_coeff_hb,
-                         int          nr_deriv_atypes,
-                         deriv_atype* deriv_atypes,
-                         int          nr_mod_atype_pairs,
-                         pair_mod*    mod_atype_pairs
-                  )
+int parse_liganddata(
+                           Liganddata*  myligand,
+                           Gridinfo*    mygrid,
+                     const double       AD4_coeff_vdW,
+                     const double       AD4_coeff_hb,
+                           int          nr_deriv_atypes,
+                           deriv_atype* deriv_atypes,
+                           int          nr_mod_atype_pairs,
+                           pair_mod*    mod_atype_pairs
+                    )
 // The functions second parameter is a Liganddata variable whose num_of_atypes
 // and atom_types fields must contain valid data.
-// The function opens the file ligfilename, which is supposed to be an AutoDock4 pdbqt file,
-// and fills the other fields of myligand according to the content of the file.
-// If the operation was successful, the function returns 0, if not, it returns 1.
+// The function parses the ligand and flexres file contents, which are supposed
+// to be AutoDock4 pdbqt files, and fills the other fields of myligand according
+// to the content of the file. If the operation was successful, the function
+// returns 0, if not, it returns 1.
 {
-	FILE* fp;
-	fpos_t fp_start;
+	int line_count=0;
+	std::string line;
+	int fp_start;
 	char tempstr [256];
 	int atom_counter;
 	int delta_count = 0;
@@ -1125,52 +1141,42 @@ int get_liganddata(
 	reserved_highest_rigid_struct_id = 1;
 
 	atom_counter = 0;
-	unsigned int lnr=1;
-	if ( flexresfilename!=NULL ) {
-		if ( strlen(flexresfilename)>0 )
-			lnr++;
-	}
+	unsigned int lnr=1+(myligand->ligand_line_count<myligand->file_content.size());
+	
+	int endline = myligand->ligand_line_count;
 	for (unsigned int l=0; l<lnr; l++)
 	{
-		if(l==0)
-			fp = fopen(ligfilename, "rb"); // fp = fopen(ligfilename, "r");
-		else
-			fp = fopen(flexresfilename, "rb"); // fp = fopen(ligfilename, "r");
-		if (fp == NULL)
-		{
-			if(l==0)
-				printf("Error: can't open ligand data file %s!\n", ligfilename);
-			else
-				printf("Error: can't open flexible residue data file %s!\n", flexresfilename);
-			return 1;
-		}
-		fgetpos (fp, &fp_start);
-	
+		if(l>0) endline =  myligand->file_content.size();
+		fp_start = line_count;
+		
 		// reading atomic coordinates, charges and atom types, and writing
 		// data to myligand->atom_idxyzq
-		while (fscanf(fp, "%255s", tempstr) != EOF)
+		while (line_count < endline)
 		{
+			line = myligand->file_content[line_count];
+			line_count++;
+			sscanf(line.c_str(),"%255s",tempstr);
 			if ((strcmp(tempstr, "HETATM") == 0) || (strcmp(tempstr, "ATOM") == 0))
 			{
 				if (atom_counter > MAX_NUM_OF_ATOMS-1)
 				{
-					printf("Error: ligand consists of too many atoms'\n");
-					printf("Maximal allowed number of atoms is %d!\n", MAX_NUM_OF_ATOMS);
+					printf("Error: System consists of too many atoms.\n");
+					printf("       Maximum number of atoms is %d.\n", MAX_NUM_OF_ATOMS);
 					return 1;
 				}
-				if ((strcmp(tempstr, "HETATM") == 0)) // seeking to the first coordinate value
-					fseek(fp, 25, SEEK_CUR);
-				else
-					fseek(fp, 27, SEEK_CUR);
-				fscanf(fp, "%lf", &(myligand->atom_idxyzq [atom_counter][1]));
-				fscanf(fp, "%lf", &(myligand->atom_idxyzq [atom_counter][2]));
-				fscanf(fp, "%lf", &(myligand->atom_idxyzq [atom_counter][3]));
-				fscanf(fp, "%255s", tempstr); // skipping the next two fields
-				fscanf(fp, "%255s", tempstr);
-				fscanf(fp, "%lf", &(myligand->atom_idxyzq [atom_counter][4])); // reading charge
-				fscanf(fp, "%4s", tempstr); // reading atom type
-				if (set_liganddata_typeid(myligand, atom_counter, tempstr) != 0) // the function sets the type index
+				line.insert(38,1,' '); // add spaces to make reading coordinates easier
+				line.insert(47,1,' ');
+				sscanf(&line.c_str()[30], "%lf %lf %lf", &(myligand->atom_idxyzq [atom_counter][1]), &(myligand->atom_idxyzq [atom_counter][2]), &(myligand->atom_idxyzq [atom_counter][3]));
+				// the last two are shifted by two chars (the two spaces we added above)
+				sscanf(&line.c_str()[72], "%lf", &(myligand->atom_idxyzq [atom_counter][4])); // reading charge
+				sscanf(&line.c_str()[79], "%3s", tempstr); // reading atom type
+				tempstr[3]='\0';
+				if (set_liganddata_typeid(myligand, mygrid, atom_counter, tempstr) != 0) // the function sets the type index
 					return 1;
+				if(tempstr[0]=='G'){ // G-type are ignored for inter calc unless there is a map specified (checked above)
+					if(myligand->atom_idxyzq[atom_counter][0]==myligand->base_type_idx[(int)myligand->atom_idxyzq[atom_counter][0]])
+						myligand->ignore_inter[atom_counter] = true;
+				}
 				atom_counter++;
 			}
 		}
@@ -1179,26 +1185,38 @@ int get_liganddata(
 		if(l==0){
 			myligand->true_ligand_atoms = atom_counter;
 			atom_counter = 0; // this looks wrong but is correct as it's increment below again (like above)
-			branch_start=0;
+			branch_start = 0;
 		} else{ // example counts 4 - 3 - 6 (lig - flex res - flex res)
 			unsigned int tmp = delta_count; // l=1: = 0 ; l=2: = 3
-			delta_count = atom_counter - myligand->true_ligand_atoms; // l=1: 7 - 4 = 3 ; l=2: 13 - 4 = 9
-			atom_counter -= delta_count - tmp; // l=1: = 7 - (3-0) = 4 ; l=2: 13 - (9-3) = 7
+			delta_count    = atom_counter - myligand->true_ligand_atoms; // l=1: 7 - 4 = 3 ; l=2: 13 - 4 = 9
+			atom_counter  -= delta_count - tmp; // l=1: = 7 - (3-0) = 4 ; l=2: 13 - (9-3) = 7
 			atom_rot_start = atom_counter;
 			branch_start=branch_counter;
 		}
 		
-		fsetpos (fp, &fp_start);
+		line_count = fp_start;
 		unsigned int flex_root = atom_rot_start; // takes care of multiple flexible residues in the same file
+		int atom_count_offset = 1; // counting usually starts at 1
+		bool first_atom = true;
 		
 		// reading data for rotbonds and atom_rotbonds fields
-		while (fscanf(fp, "%255s", tempstr) != EOF)
+		while (line_count < endline)
 		{
+			line = myligand->file_content[line_count];
+			line_count++;
+			sscanf(line.c_str(),"%255s",tempstr);
 			if ((l>0) && (strcmp(tempstr, "ROOT") == 0)){
 				flex_root = atom_counter;
+				atom_rot_start = atom_counter;
+				branch_start = branch_counter;
+				first_atom = true;
 			}
 			if ((strcmp(tempstr, "HETATM") == 0) || (strcmp(tempstr, "ATOM") == 0)) // if new atom, looking for open rotatable bonds
 			{
+				if (first_atom){
+					sscanf(&line.c_str()[6], "%d", &atom_count_offset); // reading first atom index
+					first_atom = false;
+				}
 				for (i=branch_start; i<branch_counter; i++) // for all branches found until now
 					if (branches [i][2] == 1) // if it is open, the atom has to be rotated
 						atom_rotbonds_temp [atom_counter][i] = 1; // modifying atom_rotbonds_temp
@@ -1218,17 +1236,14 @@ int get_liganddata(
 				if (branch_counter >= MAX_NUM_OF_ROTBONDS)
 				{
 					if(l==0)
-						printf("Error: ligand includes too many rotatable bonds.\n");
+						printf("Error: Ligand includes too many rotatable bonds (more than %d).\n",MAX_NUM_OF_ROTBONDS);
 					else
-						printf("Error: ligand and flexible residue include too many rotatable bonds.\n");
-					printf("Maximal allowed number is %d.\n", MAX_NUM_OF_ROTBONDS);
-					fclose(fp);
+						printf("Error: Ligand and flexible residue(s) include too many rotatable bonds (more than %d).\n",MAX_NUM_OF_ROTBONDS);
 					return 1;
 				}
-				fscanf(fp, "%d", &(branches [branch_counter][0]));
-				fscanf(fp, "%d", &(branches [branch_counter][1]));
-				branches [branch_counter][0] += atom_rot_start-1; // atom IDs start from 0 instead of 1
-				branches [branch_counter][1] += atom_rot_start-1;
+				sscanf(&line.c_str()[6], "%d %d", &(branches [branch_counter][0]), &(branches [branch_counter][1]));
+				branches [branch_counter][0] += atom_rot_start-atom_count_offset; // atom IDs start from 0 instead of 1
+				branches [branch_counter][1] += atom_rot_start-atom_count_offset;
 	
 				branches [branch_counter][2] = 1; // 1 means the branch is open, atoms will be rotated
 	
@@ -1240,10 +1255,9 @@ int get_liganddata(
 	
 			if (strcmp(tempstr, "ENDBRANCH") == 0)
 			{
-				fscanf(fp, "%d", &(myligand->rotbonds [endbranch_counter][0])); // rotatable bonds have to be stored in the order
-				fscanf(fp, "%d", &(myligand->rotbonds [endbranch_counter][1])); // of endbranches
-				myligand->rotbonds [endbranch_counter][0] += atom_rot_start-1;
-				myligand->rotbonds [endbranch_counter][1] += atom_rot_start-1;
+				sscanf(&line.c_str()[9], "%d %d", &(myligand->rotbonds [endbranch_counter][0]), &(myligand->rotbonds [endbranch_counter][1])); // rotatable bonds have to be stored in the order of endbranches
+				myligand->rotbonds [endbranch_counter][0] += atom_rot_start-atom_count_offset;
+				myligand->rotbonds [endbranch_counter][1] += atom_rot_start-atom_count_offset;
 				for (i=branch_start; i<branch_counter; i++) // the branch have to be closed
 					if ((branches [i][0] == myligand->rotbonds [endbranch_counter][0]) &&
 					    (branches [i][1] == myligand->rotbonds [endbranch_counter][1]))
@@ -1254,7 +1268,6 @@ int get_liganddata(
 		}
 		reserved_highest_rigid_struct_id++;
 		current_rigid_struct_id=reserved_highest_rigid_struct_id;
-		fclose(fp);
 		myligand->num_of_rotbonds = branch_counter;
 		if (l==0)
 			myligand->true_ligand_rotbonds = branch_counter;
@@ -1310,14 +1323,14 @@ int gen_new_pdbfile(
 	fp_old = fopen(oldpdb, "rb"); // fp_old = fopen(oldpdb, "r");
 	if (fp_old == NULL)
 	{
-		printf("Error: can't open old pdb file %s!\n", oldpdb);
+		printf("Error: Can't open file %s.\n", oldpdb);
 		return 1;
 	}
 
 	fp_new = fopen(newpdb, "w");
 	if (fp_new == NULL)
 	{
-		printf("Error: can't create new pdb file %s!\n", newpdb);
+		printf("Error: Can't create file %s.\n", newpdb);
 		fclose(fp_old);
 		return 1;
 	}
@@ -1329,7 +1342,7 @@ int gen_new_pdbfile(
 		{
 			if (acnt_oldlig >= myligand->num_of_atoms)
 			{
-				printf("Error: ligand in old pdb file includes more atoms than new one.\n");
+				printf("Error: Ligand in file %s includes more atoms than current ligand.\n",oldpdb);
 				fclose(fp_old);
 				fclose(fp_new);
 				return 1;
@@ -1430,16 +1443,17 @@ void scale_ligand(
 
 	for (i=0; i < myligand->num_of_atoms; i++){
 		for (j=1; j<4; j++)
-			myligand->atom_idxyzq [i][j] = myligand->atom_idxyzq [i][j]*scale_factor;
+			myligand->atom_idxyzq [i][j] *= scale_factor;
 //		if(i>=myligand->true_ligand_atoms)
 //			printf("%i: (%f, %f, %f)\n",i-myligand->true_ligand_atoms+1,myligand->atom_idxyzq [i][1],myligand->atom_idxyzq [i][2],myligand->atom_idxyzq [i][3]);
 	}
 }
 
 double calc_rmsd(
-                 const Liganddata* myligand_ref,
-                 const Liganddata* myligand,
-                 const bool        handle_symmetry
+                 const double       atom_idxyzq_ref [MAX_NUM_OF_ATOMS][5],
+                 const double       atom_idxyzq     [MAX_NUM_OF_ATOMS][5],
+                       unsigned int num_atoms,
+                 const bool         handle_symmetry
                 )
 // The function calculates the RMSD value (root mean square deviation of the
 // atomic distances for two conformations of the same ligand) and returns it.
@@ -1454,37 +1468,35 @@ double calc_rmsd(
 	double sumdist2;
 	double mindist2;
 
-	if (myligand_ref->true_ligand_atoms != myligand->true_ligand_atoms)
-	{
-		printf("Warning: RMSD can't be calculated, atom number mismatch %d (ref) vs. %d!\n",myligand_ref->true_ligand_atoms,myligand->true_ligand_atoms);
-		return 100000; // returning unreasonable value
-	}
-
 	sumdist2 = 0;
 
 	if (!handle_symmetry)
 	{
-		for (i=0; i<myligand->true_ligand_atoms; i++)
+		for (i=0; i<num_atoms; i++)
 		{
-			sumdist2 += pow(distance(&(myligand->atom_idxyzq [i][1]), &(myligand_ref->atom_idxyzq [i][1])), 2);
+			double d2 = distance2(&(atom_idxyzq [i][1]), &(atom_idxyzq_ref [i][1])); // coordinates start at [1]
+			sumdist2 += d2;
 		}
 	}
 	else // handling symmetry with the silly AutoDock method
 	{
-		for (i=0; i<myligand->true_ligand_atoms; i++)
+		for (i=0; i<num_atoms; i++)
 		{
 			mindist2 = 100000; // initial value should be high enough so that it is ensured that lower distances will be found
-			for (j=0; j<myligand_ref->num_of_atoms; j++) // looking for the closest atom with same type from the reference
+			for (j=0; j<num_atoms; j++) // looking for the closest atom with same type from the reference
 			{
-				if (myligand->atom_idxyzq [i][0] == myligand_ref->atom_idxyzq [j][0])
-					if (pow(distance(&(myligand->atom_idxyzq [i][1]), &(myligand_ref->atom_idxyzq [j][1])), 2) < mindist2)
-						mindist2 = pow(distance(&(myligand->atom_idxyzq [i][1]), &(myligand_ref->atom_idxyzq [j][1])), 2);
+				if (atom_idxyzq [i][0] == atom_idxyzq [j][0]){ // for same type:
+					double d2 = distance2(&(atom_idxyzq [i][1]), &(atom_idxyzq_ref [j][1]));
+					if (d2 < mindist2){
+						mindist2 = d2;
+					}
+				}
 			}
 			sumdist2 += mindist2;
 		}
 	}
 
-	return (sqrt(sumdist2/myligand->true_ligand_atoms));
+	return (sqrt(sumdist2/num_atoms));
 }
 
 double calc_ddd_Mehler_Solmajer(double distance)
@@ -1536,39 +1548,6 @@ bool is_H_bond(
 		return false;
 }
 
-void print_ref_lig_energies_f(
-                                    Liganddata myligand,
-                              const float      smooth,
-                                    Gridinfo   mygrid,
-                              const float*     fgrids,
-                              const float      scaled_AD4_coeff_elec,
-                              const float      elec_min_distance,
-                              const float      AD4_coeff_desolv,
-                              const float      qasp,
-                                    int        nr_mod_atype_pairs,
-                                    pair_mod*  mod_atype_pairs
-                            )
-// The function calculates the energies of the ligand given in the first parameter,
-// and prints them to the screen.
-{
-	double temp_vec [3];
-	float tmp;
-	int i;
-
-	IntraTables tables(&myligand, scaled_AD4_coeff_elec, AD4_coeff_desolv, qasp);
-	printf("Intramolecular energy of reference ligand: %lf\n",
-	       calc_intraE_f(&myligand, 8, smooth, 0, elec_min_distance, tables, 0, tmp, nr_mod_atype_pairs, mod_atype_pairs));
-
-	for (i=0; i<3; i++)
-		temp_vec [i] = -1*mygrid.origo_real_xyz [i];
-
-	move_ligand(&myligand, temp_vec);
-	scale_ligand(&myligand, (double) 1.0/mygrid.spacing);
-
-	printf("Intermolecular energy of reference ligand: %lf\n",
-	       calc_interE_f(&mygrid, &myligand, fgrids, 0, 0, tmp));
-}
-
 //////////////////////////////////
 //float functions
 
@@ -1734,10 +1713,8 @@ std::vector<AnalysisData> analyze_ligand_receptor(
 	const unsigned int* receptor_list;
 	AnalysisData datum;
 
-	for (atom_cnt=0; atom_cnt<myligand->true_ligand_atoms; atom_cnt++) // for each atom
+	for (atom_cnt=0; atom_cnt<myligand->true_ligand_atoms; atom_cnt++) // for each ligand atom
 	{
-		if (myligand->ignore_inter[atom_cnt])
-			continue;
 		atomtypeid = myligand->base_type_idx[(int)myligand->atom_idxyzq [atom_cnt][0]];
 		x = myligand->atom_idxyzq [atom_cnt][1];
 		y = myligand->atom_idxyzq [atom_cnt][2];
@@ -1788,10 +1765,9 @@ std::vector<AnalysisData> analyze_ligand_receptor(
 		for(unsigned int rid=1; rid<=receptor_list[0]; rid++)
 		{
 			const ReceptorAtom* curr = &receptor_atoms[receptor_list[rid]];
-			double dist2 = (curr->x-x)*(curr->x-x)+(curr->y-y)*(curr->y-y)+(curr->z-z)*(curr->z-z);
 			if((myligand->acceptor[atom_cnt] && curr->donor) ||
 			   (myligand->donor[atom_cnt] && curr->acceptor)){
-				if(dist2 <= H_cutoff){
+				if((curr->x-x)*(curr->x-x)+(curr->y-y)*(curr->y-y)+(curr->z-z)*(curr->z-z) <= H_cutoff){
 					datum.type     = 1; // 0 .. reactive, 1 .. hydrogen bond, 2 .. vdW
 					datum.lig_id   = atom_cnt+1;
 					datum.lig_name = myligand->atom_names[atom_cnt];
@@ -1806,7 +1782,7 @@ std::vector<AnalysisData> analyze_ligand_receptor(
 				if((myligand->base_atom_types[atomtypeid][0]!='H') && (curr->atom_type[0]!='H') && // exclude Hydrogens,
 				   !myligand->acceptor[atom_cnt] && !myligand->donor[atom_cnt] &&                  // non-H-bond capable atoms on ligand
 				   !curr->acceptor && !curr->donor){                                               // ... and receptor
-					if(dist2 <= V_cutoff){
+					if((curr->x-x)*(curr->x-x)+(curr->y-y)*(curr->y-y)+(curr->z-z)*(curr->z-z) <= V_cutoff){
 						datum.type     = 2; // 0 .. reactive, 1 .. hydrogen bond, 2 .. vdW
 						datum.lig_id   = atom_cnt+1;
 						datum.lig_name = myligand->atom_names[atom_cnt];
@@ -1828,52 +1804,70 @@ std::vector<AnalysisData> analyze_ligand_receptor(
 float calc_interE_f(
                     const Gridinfo*   mygrid,
                     const Liganddata* myligand,
-                    const float*      fgrids,
                           float       outofgrid_tolerance,
                           int         debug,
-                          float&      intraflexE
+                          float&      intraflexE,
+                          float*      elecE,
+                          float*      peratom_vdw,
+                          float*      peratom_elec
                    )
 // The function calculates the intermolecular energy of a ligand (given by myligand parameter),
-// and a receptor (represented as a grid). The grid point values must be stored at the location
-// which starts at fgrids, the memory content can be generated with get_gridvalues funciton.
-// The mygrid parameter must be the corresponding grid informtaion. If an atom is outside the
-// grid, the coordinates will be changed with the value of outofgrid_tolerance, if it remains
-// outside, a very high value will be added to the current energy as a penality. If the fifth
-// parameter is one, debug messages will be printed to the screen during calculation.
+// and a receptor (represented as a grid). The grid point values must have been stored in
+// mygrid->grids with the get_gridvalues function.
+// If an atom is outside the grid, its coordinates will be changed by at most the value of
+// outofgrid_tolerance; if it still remains outside, a very large value will be added to the
+// current energy as a penalty.
 {
 	float interE;
 	int atom_cnt;
 	float x, y, z;
-	int atomtypeid;
+	int atom_typeid;
 	int x_low, x_high, y_low, y_high, z_low, z_high;
 	float q, x_frac, y_frac, z_frac;
-	float cube [2][2][2];
-	float weights [2][2][2];
 	float dx, dy, dz;
 
-	float val;
+	float v, e, val;
 	interE = 0;
 	intraflexE = 0;
+	bool peratom = false;
+	if(elecE != NULL){
+		peratom = true;
+		*elecE = 0;
+	}
+
+	unsigned int g1 = mygrid->size_xyz[0];
+	unsigned int g2 = g1*mygrid->size_xyz[1];
+	unsigned int g3_4 = g2*mygrid->size_xyz[2]<<2; // g3 multiplied by 4
+
+	float weights[8];
+	float cube[8];
+	unsigned long mul_tmp;
 
-	for (atom_cnt=myligand->num_of_atoms-1; atom_cnt>=0; atom_cnt--) // for each atom
+	for (atom_cnt=0; atom_cnt<myligand->num_of_atoms; atom_cnt++) // for each atom
 	{
 		val = 0.0;
-		if (myligand->ignore_inter[atom_cnt])
+		if (myligand->ignore_inter[atom_cnt]){
+			if(peratom){
+				peratom_vdw[atom_cnt] = 0;
+				peratom_elec[atom_cnt] = 0;
+			}
 			continue;
-		atomtypeid = myligand->base_type_idx[(int)myligand->atom_idxyzq [atom_cnt][0]];
+		}
+		mul_tmp = myligand->atom_map_to_fgrids[atom_cnt] * g3_4;
 		x = myligand->atom_idxyzq [atom_cnt][1];
 		y = myligand->atom_idxyzq [atom_cnt][2];
 		z = myligand->atom_idxyzq [atom_cnt][3];
 		q = myligand->atom_idxyzq [atom_cnt][4];
 
-		if ((x < 0) || (x >= mygrid->size_xyz [0]-1) || (y < 0) || (y >= mygrid->size_xyz [1]-1) ||
-			(z < 0) || (z >= mygrid->size_xyz [2]-1)) // if the atom is outside of the grid
+		if ((x < 0) || (x >= mygrid->size_xyz [0]-1) ||
+		    (y < 0) || (y >= mygrid->size_xyz [1]-1) ||
+		    (z < 0) || (z >= mygrid->size_xyz [2]-1)) // if the atom is outside of the grid
 		{
 			if (debug == 1)
 			{
 				printf("\n\nPartial results for atom with id %d:\n", atom_cnt);
 				printf("Atom out of grid: ");
-				printf("x= %lf, y = %lf, z = %lf\n", x, y, z);
+				printf("x= %f, y = %f, z = %f\n", x, y, z);
 			}
 
 			if (outofgrid_tolerance != 0) // if tolerance is set, try to place atom back into the grid
@@ -1903,358 +1897,169 @@ float calc_interE_f(
 					interE += val;
 				else
 					intraflexE += val;
+				if(peratom){
+					peratom_vdw[atom_cnt] = 100000;
+					peratom_elec[atom_cnt] = 100000;
+				}
 				continue;
 			}
 
 			if (debug == 1)
 			{
 				printf("\n\nAtom was placed back into the grid according to the tolerance value %f:\n", outofgrid_tolerance);
-				printf("x= %lf, y = %lf, z = %lf\n", x, y, z);
+				printf("x= %f, y = %f, z = %f\n", x, y, z);
 			}
 		}
 
-		x_low = (int) floor(x);
-		y_low = (int) floor(y);
-		z_low = (int) floor(z);
-		x_high = (int) ceil(x);
-		y_high = (int) ceil(y);
-		z_high = (int) ceil(z);
-		x_frac = x - x_low;
-		y_frac = y - y_low;
-		z_frac = z - z_low;
-		dx = x_frac;
-		dy = y_frac;
-		dz = z_frac;
-
-		get_trilininterpol_weights_f(weights, &dx, &dy, &dz);
+		// Getting coordinates
+		float x_low  = floor(x);
+		float y_low  = floor(y);
+		float z_low  = floor(z);
+
+		// Grid value at 000
+		const float* grid_value_000 = mygrid->grids.data() + ((unsigned long)(x_low  + y_low*g1  + z_low*g2)<<2);
+
+		float dx = x - x_low;
+		float omdx = 1.0f - dx;
+		float dy = y - y_low;
+		float omdy = 1.0f - dy;
+		float dz = z - z_low;
+		float omdz = 1.0f - dz;
+
+		// Calculating interpolation weights
+		weights [idx_000] = omdx*omdy*omdz;
+		weights [idx_010] = omdx*dy*omdz;
+		weights [idx_001] = omdx*omdy*dz;
+		weights [idx_011] = omdx*dy*dz;
+		weights [idx_100] = dx*omdy*omdz;
+		weights [idx_110] = dx*dy*omdz;
+		weights [idx_101] = dx*omdy*dz;
+		weights [idx_111] = dx*dy*dz;
 
 		if (debug == 1)
 		{
 			printf("\n\nPartial results for atom with id %d:\n", atom_cnt);
-			printf("x_low = %d, x_high = %d, x_frac = %lf\n", x_low, x_high, x_frac);
-			printf("y_low = %d, y_high = %d, y_frac = %lf\n", y_low, y_high, y_frac);
-			printf("z_low = %d, z_high = %d, z_frac = %lf\n\n", z_low, z_high, z_frac);
-			printf("coeff(0,0,0) = %lf\n", weights [0][0][0]);
-			printf("coeff(1,0,0) = %lf\n", weights [1][0][0]);
-			printf("coeff(0,1,0) = %lf\n", weights [0][1][0]);
-			printf("coeff(1,1,0) = %lf\n", weights [1][1][0]);
-			printf("coeff(0,0,1) = %lf\n", weights [0][0][1]);
-			printf("coeff(1,0,1) = %lf\n", weights [1][0][1]);
-			printf("coeff(0,1,1) = %lf\n", weights [0][1][1]);
-			printf("coeff(1,1,1) = %lf\n", weights [1][1][1]);
+			printf("x_low = %d, x_high = %d, x_frac = %f\n", (int)x_low, (int)ceil(x), dx); // x_low is a float local here; x_high/x_frac are never assigned
+			printf("y_low = %d, y_high = %d, y_frac = %f\n", (int)y_low, (int)ceil(y), dy); // so derive the printed values from y_low, y and dy instead
+			printf("z_low = %d, z_high = %d, z_frac = %f\n\n", (int)z_low, (int)ceil(z), dz); // same for z (passing a float to %d is undefined behaviour)
+			printf("coeff(0,0,0) = %f\n", weights [idx_000]);
+			printf("coeff(1,0,0) = %f\n", weights [idx_100]);
+			printf("coeff(0,1,0) = %f\n", weights [idx_010]);
+			printf("coeff(1,1,0) = %f\n", weights [idx_110]);
+			printf("coeff(0,0,1) = %f\n", weights [idx_001]);
+			printf("coeff(1,0,1) = %f\n", weights [idx_101]);
+			printf("coeff(0,1,1) = %f\n", weights [idx_011]);
+			printf("coeff(1,1,1) = %f\n", weights [idx_111]);
 		}
 
-		// energy contribution of the current grid type
-
-		cube [0][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_low);
-		cube [1][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_high);
-		cube [0][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_low);
-		cube [1][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_high);
-		cube [0][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_low);
-		cube [1][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_high);
-		cube [0][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_low);
-		cube [1][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_high);
+		cube[0] = *(grid_value_000+mul_tmp);
+		cube[1] = *(grid_value_000+mul_tmp+1);
+		cube[2] = *(grid_value_000+mul_tmp+2);
+		cube[3] = *(grid_value_000+mul_tmp+3);
+		cube[4] = *(grid_value_000+mul_tmp+4);
+		cube[5] = *(grid_value_000+mul_tmp+5);
+		cube[6] = *(grid_value_000+mul_tmp+6);
+		cube[7] = *(grid_value_000+mul_tmp+7);
 
 		if (debug == 1)
 		{
 			printf("Interpolation of van der Waals map:\n");
-			printf("cube(0,0,0) = %lf\n", cube [0][0][0]);
-			printf("cube(1,0,0) = %lf\n", cube [1][0][0]);
-			printf("cube(0,1,0) = %lf\n", cube [0][1][0]);
-			printf("cube(1,1,0) = %lf\n", cube [1][1][0]);
-			printf("cube(0,0,1) = %lf\n", cube [0][0][1]);
-			printf("cube(1,0,1) = %lf\n", cube [1][0][1]);
-			printf("cube(0,1,1) = %lf\n", cube [0][1][1]);
-			printf("cube(1,1,1) = %lf\n", cube [1][1][1]);
+			printf("cube(0,0,0) = %f\n", cube [idx_000]);
+			printf("cube(1,0,0) = %f\n", cube [idx_100]);
+			printf("cube(0,1,0) = %f\n", cube [idx_010]);
+			printf("cube(1,1,0) = %f\n", cube [idx_110]);
+			printf("cube(0,0,1) = %f\n", cube [idx_001]);
+			printf("cube(1,0,1) = %f\n", cube [idx_101]);
+			printf("cube(0,1,1) = %f\n", cube [idx_011]);
+			printf("cube(1,1,1) = %f\n", cube [idx_111]);
 		}
 
-
-		val += trilin_interpol(cube, weights);
+		// energy contribution of the current grid type
+		v = cube[0]*weights[0] + cube[1]*weights[1] + cube[2]*weights[2] + cube[3]*weights[3] + cube[4]*weights[4] + cube[5]*weights[5] + cube[6]*weights[6] + cube[7]*weights[7];
+		val += v;
 
 		if (debug == 1)
-			printf("interpoated value = %lf\n\n", trilin_interpol(cube, weights));
+			printf("interpolated value = %f\n\n", v);
 
 		// energy contribution of the electrostatic grid
-
-		atomtypeid = mygrid->num_of_atypes;
-
-		cube [0][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_low);
-		cube [1][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_high);
-		cube [0][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_low);
-		cube [1][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_high);
-		cube [0][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_low);
-		cube [1][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_high);
-		cube [0][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_low);
-		cube [1][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_high);
+		mul_tmp = mygrid->num_of_map_atypes*g3_4; // relative address of electrostatics map
+		cube[0] = *(grid_value_000+mul_tmp);
+		cube[1] = *(grid_value_000+mul_tmp+1);
+		cube[2] = *(grid_value_000+mul_tmp+2);
+		cube[3] = *(grid_value_000+mul_tmp+3);
+		cube[4] = *(grid_value_000+mul_tmp+4);
+		cube[5] = *(grid_value_000+mul_tmp+5);
+		cube[6] = *(grid_value_000+mul_tmp+6);
+		cube[7] = *(grid_value_000+mul_tmp+7);
 
 		if (debug == 1)
 		{
 			printf("Interpolation of electrostatic map:\n");
-			printf("cube(0,0,0) = %lf\n", cube [0][0][0]);
-			printf("cube(1,0,0) = %lf\n", cube [1][0][0]);
-			printf("cube(0,1,0) = %lf\n", cube [0][1][0]);
-			printf("cube(1,1,0) = %lf\n", cube [1][1][0]);
-			printf("cube(0,0,1) = %lf\n", cube [0][0][1]);
-			printf("cube(1,0,1) = %lf\n", cube [1][0][1]);
-			printf("cube(0,1,1) = %lf\n", cube [0][1][1]);
-			printf("cube(1,1,1) = %lf\n", cube [1][1][1]);
+			printf("cube(0,0,0) = %f\n", cube [idx_000]);
+			printf("cube(1,0,0) = %f\n", cube [idx_100]);
+			printf("cube(0,1,0) = %f\n", cube [idx_010]);
+			printf("cube(1,1,0) = %f\n", cube [idx_110]);
+			printf("cube(0,0,1) = %f\n", cube [idx_001]);
+			printf("cube(1,0,1) = %f\n", cube [idx_101]);
+			printf("cube(0,1,1) = %f\n", cube [idx_011]);
+			printf("cube(1,1,1) = %f\n", cube [idx_111]);
 		}
 
-
-		val += q * trilin_interpol(cube, weights);
+		// Calculating electrostatic energy (charge times interpolated electrostatic map value)
+		e = q * (cube[0]*weights[0] + cube[1]*weights[1] + cube[2]*weights[2] + cube[3]*weights[3] + cube[4]*weights[4] + cube[5]*weights[5] + cube[6]*weights[6] + cube[7]*weights[7]);
+		val += e;
+		if(peratom){
+#ifndef AD4_desolv_peratom_vdW
+			peratom_vdw[atom_cnt] = v;
+#endif
+			peratom_elec[atom_cnt] = e;
+			*elecE += e;
+		}
 
 		if (debug == 1)
-			printf("interpoated value = %lf, multiplied by q = %lf\n\n", trilin_interpol(cube, weights), q*trilin_interpol(cube, weights));
-
-		// energy contribution of the desolvation grid
-
-		atomtypeid = mygrid->num_of_atypes+1;
-
-		cube [0][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_low);
-		cube [1][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_high);
-		cube [0][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_low);
-		cube [1][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_high);
-		cube [0][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_low);
-		cube [1][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_high);
-		cube [0][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_low);
-		cube [1][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_high);
+			printf("interpolated value = %f (multiplied by q = %f)\n\n", e, q);
+
+		// Capturing desolvation values (next map compared to above => mul_tmp += g3*4)
+		mul_tmp += g3_4;
+		cube[0] = *(grid_value_000+mul_tmp);
+		cube[1] = *(grid_value_000+mul_tmp+1);
+		cube[2] = *(grid_value_000+mul_tmp+2);
+		cube[3] = *(grid_value_000+mul_tmp+3);
+		cube[4] = *(grid_value_000+mul_tmp+4);
+		cube[5] = *(grid_value_000+mul_tmp+5);
+		cube[6] = *(grid_value_000+mul_tmp+6);
+		cube[7] = *(grid_value_000+mul_tmp+7);
 
 		if (debug == 1)
 		{
 			printf("Interpolation of desolvation map:\n");
-			printf("cube(0,0,0) = %lf\n", cube [0][0][0]);
-			printf("cube(1,0,0) = %lf\n", cube [1][0][0]);
-			printf("cube(0,1,0) = %lf\n", cube [0][1][0]);
-			printf("cube(1,1,0) = %lf\n", cube [1][1][0]);
-			printf("cube(0,0,1) = %lf\n", cube [0][0][1]);
-			printf("cube(1,0,1) = %lf\n", cube [1][0][1]);
-			printf("cube(0,1,1) = %lf\n", cube [0][1][1]);
-			printf("cube(1,1,1) = %lf\n", cube [1][1][1]);
+			printf("cube(0,0,0) = %f\n", cube [idx_000]);
+			printf("cube(1,0,0) = %f\n", cube [idx_100]);
+			printf("cube(0,1,0) = %f\n", cube [idx_010]);
+			printf("cube(1,1,0) = %f\n", cube [idx_110]);
+			printf("cube(0,0,1) = %f\n", cube [idx_001]);
+			printf("cube(1,0,1) = %f\n", cube [idx_101]);
+			printf("cube(0,1,1) = %f\n", cube [idx_011]);
+			printf("cube(1,1,1) = %f\n", cube [idx_111]);
 		}
 
-		val += fabs(q) * trilin_interpol(cube, weights);
-
+		// Calculating desolvation energy (|charge| times interpolated desolvation map value)
+		e = fabs(q) * (cube[0]*weights[0] + cube[1]*weights[1] + cube[2]*weights[2] + cube[3]*weights[3] + cube[4]*weights[4] + cube[5]*weights[5] + cube[6]*weights[6] + cube[7]*weights[7]);
+		val += e;
+#ifdef AD4_desolv_peratom_vdW
+		if(peratom) peratom_vdw[atom_cnt] = v + e;
+#endif
 		if (atom_cnt < myligand->true_ligand_atoms)
 			interE += val;
 		else
 			intraflexE += val;
 
-		if (debug == 1)
-			printf("interpoated value = %lf, multiplied by abs(q) = %lf\n\n", trilin_interpol(cube, weights), fabs(q) * trilin_interpol(cube, weights));
-
-		if (debug == 1)
-			printf("Current value of intermolecular energy = %lf\n\n\n", interE);
-	}
-	return interE;
-}
-
-void calc_interE_peratom_f(
-                           const Gridinfo*   mygrid,
-                           const Liganddata* myligand,
-                           const float*      fgrids,
-                                 float       outofgrid_tolerance,
-                                 float*      elecE,
-                                 float       peratom_vdw [MAX_NUM_OF_ATOMS],
-                                 float       peratom_elec [MAX_NUM_OF_ATOMS],
-                                 int         debug
-                          )
-{
-	//float interE;
-	int atom_cnt;
-	float x, y, z;
-	int atomtypeid;
-	int x_low, x_high, y_low, y_high, z_low, z_high;
-	float q, x_frac, y_frac, z_frac;
-	float cube [2][2][2];
-	float weights [2][2][2];
-	float dx, dy, dz;
-
-	//interE = 0;
-	*elecE = 0;
-
-	for (atom_cnt=myligand->num_of_atoms-1; atom_cnt>=0; atom_cnt--)		//for each atom
-	{
-		if (myligand->ignore_inter[atom_cnt])
-			continue;
-		atomtypeid = myligand->base_type_idx[(int)myligand->atom_idxyzq [atom_cnt][0]];
-		x = myligand->atom_idxyzq [atom_cnt][1];
-		y = myligand->atom_idxyzq [atom_cnt][2];
-		z = myligand->atom_idxyzq [atom_cnt][3];
-		q = myligand->atom_idxyzq [atom_cnt][4];
-
-		if ((x < 0) || (x >= mygrid->size_xyz [0]-1) ||
-		    (y < 0) || (y >= mygrid->size_xyz [1]-1) ||
-		    (z < 0) || (z >= mygrid->size_xyz [2]-1)) // if the atom is outside of the grid
-		{
-			if (debug == 1)
-			{
-				printf("\n\nPartial results for atom with id %d:\n", atom_cnt);
-				printf("Atom out of grid: ");
-				printf("x= %lf, y = %lf, z = %lf\n", x, y, z);
-			}
-
-			if (outofgrid_tolerance != 0) // if tolerance is set, try to place atom back into the grid
-			{
-				if (x < 0)
-					x += outofgrid_tolerance;
-				if (y < 0)
-					y += outofgrid_tolerance;
-				if (z < 0)
-					z += outofgrid_tolerance;
-				if (x >= mygrid->size_xyz [0]-1)
-					x -= outofgrid_tolerance;
-				if (y >= mygrid->size_xyz [1]-1)
-					y -= outofgrid_tolerance;
-				if (z >= mygrid->size_xyz [2]-1)
-					z -= outofgrid_tolerance;
-			}
-
-			if ((x < 0) || (x >= mygrid->size_xyz [0]-1) || (y < 0) || (y >= mygrid->size_xyz [1]-1) ||
-						(z < 0) || (z >= mygrid->size_xyz [2]-1)) // check again if the atom is outside of the grid
-			{
-				//interE = HIGHEST_ENERGY; // return maximal value
-				//return interE;
-				//interE += 16777216; // penalty is 2^24 for each atom outside the grid
-				peratom_vdw[atom_cnt] = 100000;
-				peratom_elec[atom_cnt] = 100000;
-				continue;
-			}
-
-			if (debug == 1)
-			{
-				printf("\n\nAtom was placed back into the grid according to the tolerance value %f:\n", outofgrid_tolerance);
-				printf("x= %lf, y = %lf, z = %lf\n", x, y, z);
-			}
-		}
-
-		x_low = (int) floor(x);
-		y_low = (int) floor(y);
-		z_low = (int) floor(z);
-		x_high = (int) ceil(x);
-		y_high = (int) ceil(y);
-		z_high = (int) ceil(z);
-		x_frac = x - x_low;
-		y_frac = y - y_low;
-		z_frac = z - z_low;
-		dx = x_frac;
-		dy = y_frac;
-		dz = z_frac;
-
-		get_trilininterpol_weights_f(weights, &dx, &dy, &dz);
-
-		if (debug == 1)
-		{
-			printf("\n\nPartial results for atom with id %d:\n", atom_cnt);
-			printf("x_low = %d, x_high = %d, x_frac = %lf\n", x_low, x_high, x_frac);
-			printf("y_low = %d, y_high = %d, y_frac = %lf\n", y_low, y_high, y_frac);
-			printf("z_low = %d, z_high = %d, z_frac = %lf\n\n", z_low, z_high, z_frac);
-			printf("coeff(0,0,0) = %lf\n", weights [0][0][0]);
-			printf("coeff(1,0,0) = %lf\n", weights [1][0][0]);
-			printf("coeff(0,1,0) = %lf\n", weights [0][1][0]);
-			printf("coeff(1,1,0) = %lf\n", weights [1][1][0]);
-			printf("coeff(0,0,1) = %lf\n", weights [0][0][1]);
-			printf("coeff(1,0,1) = %lf\n", weights [1][0][1]);
-			printf("coeff(0,1,1) = %lf\n", weights [0][1][1]);
-			printf("coeff(1,1,1) = %lf\n", weights [1][1][1]);
-		}
-
-		// energy contribution of the current grid type
-
-		cube [0][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_low);
-		cube [1][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_high);
-		cube [0][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_low);
-		cube [1][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_high);
-		cube [0][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_low);
-		cube [1][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_high);
-		cube [0][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_low);
-		cube [1][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_high);
-
-		if (debug == 1)
-		{
-			printf("Interpolation of van der Waals map:\n");
-			printf("cube(0,0,0) = %lf\n", cube [0][0][0]);
-			printf("cube(1,0,0) = %lf\n", cube [1][0][0]);
-			printf("cube(0,1,0) = %lf\n", cube [0][1][0]);
-			printf("cube(1,1,0) = %lf\n", cube [1][1][0]);
-			printf("cube(0,0,1) = %lf\n", cube [0][0][1]);
-			printf("cube(1,0,1) = %lf\n", cube [1][0][1]);
-			printf("cube(0,1,1) = %lf\n", cube [0][1][1]);
-			printf("cube(1,1,1) = %lf\n", cube [1][1][1]);
+		if (debug == 1){
+			printf("interpolated value = %f (multiplied by abs(q) = %f)\n\n", e, fabs(q));
+			printf("Current value of intermolecular energy = %f, intramolecular flex res energy = %f\n\n\n", interE, intraflexE);
 		}
-
-
-		//interE += trilin_interpol(cube, weights);
-		peratom_vdw[atom_cnt] = trilin_interpol(cube, weights);
-
-		if (debug == 1)
-			printf("interpolated value = %lf\n\n", trilin_interpol(cube, weights));
-
-		// energy contribution of the electrostatic grid
-
-		atomtypeid = mygrid->num_of_atypes;
-
-		cube [0][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_low);
-		cube [1][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_high);
-		cube [0][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_low);
-		cube [1][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_high);
-		cube [0][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_low);
-		cube [1][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_high);
-		cube [0][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_low);
-		cube [1][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_high);
-
-		if (debug == 1)
-		{
-			printf("Interpolation of electrostatic map:\n");
-			printf("cube(0,0,0) = %lf\n", cube [0][0][0]);
-			printf("cube(1,0,0) = %lf\n", cube [1][0][0]);
-			printf("cube(0,1,0) = %lf\n", cube [0][1][0]);
-			printf("cube(1,1,0) = %lf\n", cube [1][1][0]);
-			printf("cube(0,0,1) = %lf\n", cube [0][0][1]);
-			printf("cube(1,0,1) = %lf\n", cube [1][0][1]);
-			printf("cube(0,1,1) = %lf\n", cube [0][1][1]);
-			printf("cube(1,1,1) = %lf\n", cube [1][1][1]);
-		}
-
-
-		//interE += q * trilin_interpol(cube, weights);
-		peratom_elec[atom_cnt] = q * trilin_interpol(cube, weights);
-		*elecE += q * trilin_interpol(cube, weights);
-
-		if (debug == 1)
-			printf("interpolated value = %lf, multiplied by q = %lf\n\n", trilin_interpol(cube, weights), q*trilin_interpol(cube, weights));
-
-#ifdef AD4_desolv_peratom_vdW
-		// energy contribution of the desolvation grid
-		atomtypeid = mygrid->num_of_atypes+1;
-
-		cube [0][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_low);
-		cube [1][0][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_low, x_high);
-		cube [0][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_low);
-		cube [1][1][0] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_low, y_high, x_high);
-		cube [0][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_low);
-		cube [1][0][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_low, x_high);
-		cube [0][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_low);
-		cube [1][1][1] = getvalue_4Darr(fgrids, *mygrid, atomtypeid, z_high, y_high, x_high);
-
-		if (debug == 1)
-		{
-			printf("Interpolation of desolvation map:\n");
-			printf("cube(0,0,0) = %lf\n", cube [0][0][0]);
-			printf("cube(1,0,0) = %lf\n", cube [1][0][0]);
-			printf("cube(0,1,0) = %lf\n", cube [0][1][0]);
-			printf("cube(1,1,0) = %lf\n", cube [1][1][0]);
-			printf("cube(0,0,1) = %lf\n", cube [0][0][1]);
-			printf("cube(1,0,1) = %lf\n", cube [1][0][1]);
-			printf("cube(0,1,1) = %lf\n", cube [0][1][1]);
-			printf("cube(1,1,1) = %lf\n", cube [1][1][1]);
-		}
-
-		peratom_vdw[atom_cnt] += fabs(q) * trilin_interpol(cube, weights);
-
-		if (debug == 1)
-			printf("interpolated value = %lf, multiplied by abs(q) = %lf\n\n", trilin_interpol(cube, weights), fabs(q) * trilin_interpol(cube, weights));
-#endif
 	}
+	return interE;
 }
 
 // Corrected host "calc_intraE_f" function after smoothing was added
@@ -2264,11 +2069,9 @@ float calc_intraE_f(
                           float                     smooth,
                           bool                      ignore_desolv,
                     const float                     elec_min_distance,
-                          IntraTables&              tables,
+                          IntraTables*              tables,
                           int                       debug,
                           float&                    interflexE,
-                          int                       nr_mod_atype_pairs,
-                          pair_mod*                 mod_atype_pairs,
                           std::vector<AnalysisData> *analysis,
                     const ReceptorAtom*             flexres_atoms,
                           float                     R_cutoff,
@@ -2292,7 +2095,7 @@ float calc_intraE_f(
 	float vW, el, desolv;
 	bool analyze = (analysis!=NULL);
 	bool a_flex, b_flex;
-	int atomtypeid;
+	int atom_typeid;
 	bool flex_reactive;
 	AnalysisData datum;
 
@@ -2309,15 +2112,20 @@ float calc_intraE_f(
 		a_flex = (atom_id1>=myligand->true_ligand_atoms);
 		for (atom_id2=atom_id1+1; atom_id2<myligand->num_of_atoms; atom_id2++)
 		{
-			b_flex = (atom_id2>=myligand->true_ligand_atoms);
 			if (myligand->intraE_contributors [atom_id1][atom_id2] == 1) // if they have to be included in intramolecular energy calculation
 			{                                                            // the energy contribution has to be calculated
+				b_flex = (atom_id2>=myligand->true_ligand_atoms);
+
 				dist = distance(&(myligand->atom_idxyzq [atom_id1][1]), &(myligand->atom_idxyzq [atom_id2][1]));
+				distance_id = (int) floor((100.0f*dist) + 0.5f) - 1; // +0.5: rounding, -1: r_xx_table [0] corresponds to r=0.01
+				if (distance_id < 0) {
+					distance_id = 0;
+				}
 
 				if (debug == 1)
 				{
 					printf("\n\nCalculating energy contribution of atoms %d and %d\n", atom_id1+1, atom_id2+1);
-					printf("Distance: %lf\n", dist);
+					printf("Distance: %f\n", dist);
 				}
 
 				// Adding smoothing
@@ -2329,34 +2137,6 @@ float calc_intraE_f(
 				unsigned int atom1_type_vdw_hb = myligand->atom_types_reqm [type_id1];
 				unsigned int atom2_type_vdw_hb = myligand->atom_types_reqm [type_id2];
 
-				// Getting optimum pair distance (opt_distance) from reqm and reqm_hbond
-				float opt_distance = myligand->reqm_AB [type_id1][type_id2];
-
-				// Getting smoothed distance
-				// smoothed_distance = function(dist, opt_distance)
-				float smoothed_distance;
-				float delta_distance = 0.5f*smooth;
-
-				if (dist <= (opt_distance - delta_distance)) {
-					smoothed_distance = dist + delta_distance;
-				}
-				else if (dist < (opt_distance + delta_distance)) {
-					smoothed_distance = opt_distance;
-				}
-				else { // else if (dist >= (opt_distance + delta_distance))
-					smoothed_distance = dist - delta_distance;
-				}
-
-				distance_id = (int) floor((100.0f*dist) + 0.5f) - 1; // +0.5: rounding, -1: r_xx_table [0] corresponds to r=0.01
-				if (distance_id < 0) {
-					distance_id = 0;
-				}
-
-				smoothed_distance_id = (int) floor((100.0f*smoothed_distance) + 0.5f) - 1; // +0.5: rounding, -1: r_xx_table [0] corresponds to r=0.01
-				if (smoothed_distance_id < 0) {
-					smoothed_distance_id = 0;
-				}
-
 				// ------------------------------------------------
 				// Required only for flexrings
 				// Checking if this is a CG-G0 atomic pair.
@@ -2365,7 +2145,7 @@ float calc_intraE_f(
 				// This interaction is evaluated at any distance,
 				// so no cuttoffs considered here!
 				// FIXME: accumulated into vW ... is that correct?
-				if (((atom1_type_vdw_hb == ATYPE_CG_IDX) && (atom2_type_vdw_hb == ATYPE_G0_IDX)) || 
+				if (((atom1_type_vdw_hb == ATYPE_CG_IDX) && (atom2_type_vdw_hb == ATYPE_G0_IDX)) ||
 				    ((atom1_type_vdw_hb == ATYPE_G0_IDX) && (atom2_type_vdw_hb == ATYPE_CG_IDX))) {
 					if (((atom_id1<myligand->true_ligand_atoms) && (atom_id2<myligand->true_ligand_atoms)) ||
 					    ((atom_id1>=myligand->true_ligand_atoms) && (atom_id2>=myligand->true_ligand_atoms))) // if both atoms are of either a ligand or a flex res it's intra
@@ -2377,17 +2157,40 @@ float calc_intraE_f(
 				// ------------------------------------------------
 				if (dist < dcutoff) // but only if the distance is less than distance cutoff value
 				{
-					pair_mod* pm = is_mod_pair(myligand->atom_types[type_id1], myligand->atom_types[type_id2], nr_mod_atype_pairs, mod_atype_pairs);
-					if (tables.is_HB [type_id1][type_id2] && !pm) //H-bond
+					// Getting optimum pair distance (opt_distance) from reqm and reqm_hbond
+					float opt_distance = myligand->reqm_AB [type_id1][type_id2];
+
+					// Getting smoothed distance
+					// smoothed_distance = function(dist, opt_distance)
+					float smoothed_distance;
+					float delta_distance = 0.5f*smooth;
+
+					if (dist <= (opt_distance - delta_distance)) {
+						smoothed_distance = dist + delta_distance;
+					}
+					else if (dist < (opt_distance + delta_distance)) {
+						smoothed_distance = opt_distance;
+					}
+					else { // else if (dist >= (opt_distance + delta_distance))
+						smoothed_distance = dist - delta_distance;
+					}
+
+					smoothed_distance_id = (int) floor((100.0f*smoothed_distance) + 0.5f) - 1; // +0.5: rounding, -1: r_xx_table [0] corresponds to r=0.01
+					if (smoothed_distance_id < 0) {
+						smoothed_distance_id = 0;
+					}
+
+					pair_mod* pm = tables->mod_pair [type_id1][type_id2];
+					if (tables->is_HB [type_id1][type_id2] && !pm) //H-bond
 					{
-						vdW1 = myligand->VWpars_C [type_id1][type_id2]*tables.r_12_table [smoothed_distance_id];
-						vdW2 = myligand->VWpars_D [type_id1][type_id2]*tables.r_10_table [smoothed_distance_id];
+						vdW1 = myligand->VWpars_C [type_id1][type_id2]*tables->r_12_table [smoothed_distance_id];
+						vdW2 = myligand->VWpars_D [type_id1][type_id2]*tables->r_10_table [smoothed_distance_id];
 						if (debug == 1) printf("H-bond interaction = ");
 					}
 					else // normal van der Waals or mod pair
 					{
-						float r_A = tables.r_12_table [smoothed_distance_id];
-						float r_B = tables.r_6_table  [smoothed_distance_id];
+						float r_A = tables->r_12_table [smoothed_distance_id];
+						float r_B = tables->r_6_table  [smoothed_distance_id];
 						if(pm){
 							int m = (myligand->VWpars_exp [type_id1][type_id2] & 0xFF00) >> 8;
 							int n = (myligand->VWpars_exp [type_id1][type_id2] & 0xFF);
@@ -2409,12 +2212,12 @@ float calc_intraE_f(
 						if (analyze){
 							const ReceptorAtom* curr;
 							if(a_flex){ // a is flexres, b is ligand
-								atomtypeid = myligand->base_type_idx[type_id2];
+								atom_typeid = myligand->base_type_idx[type_id2];
 								atom_cnt = atom_id2;
 								curr = &flexres_atoms[atom_id1-myligand->true_ligand_atoms];
 								flex_reactive = myligand->reactive[atom_id1];
 							} else{ // a is ligand, b is flexres
-								atomtypeid = myligand->base_type_idx[type_id1];
+								atom_typeid = myligand->base_type_idx[type_id1];
 								atom_cnt = atom_id1;
 								curr = &flexres_atoms[atom_id2-myligand->true_ligand_atoms];
 								flex_reactive = myligand->reactive[atom_id2];
@@ -2444,7 +2247,7 @@ float calc_intraE_f(
 										analysis->push_back(datum);
 									}
 								} else{
-									if((myligand->base_atom_types[atomtypeid][0]!='H') && (curr->atom_type[0]!='H') && // exclude Hydrogens,
+									if((myligand->base_atom_types[atom_typeid][0]!='H') && (curr->atom_type[0]!='H') && // exclude Hydrogens,
 									   !myligand->acceptor[atom_cnt] && !myligand->donor[atom_cnt] &&                  // non-H-bond capable atoms on ligand,
 									   !curr->acceptor && !curr->donor){                                               // as well as flexres
 										if(dist <= V_cutoff){
@@ -2472,21 +2275,21 @@ float calc_intraE_f(
 						dist=elec_min_distance;
 						distance_id = (int) floor((100*dist) + 0.5) - 1; // +0.5: rounding, -1: r_xx_table [0] corresponds to r=0.01
 					}
-					s1 = (myligand->solpar [type_id1] + tables.qasp_mul_absq [atom_id1]);
-					s2 = (myligand->solpar [type_id2] + tables.qasp_mul_absq [atom_id2]);
+					s1 = (myligand->solpar [type_id1] + tables->qasp_mul_absq [atom_id1]);
+					s2 = (myligand->solpar [type_id2] + tables->qasp_mul_absq [atom_id2]);
 					v1 = myligand->volume [type_id1];
 					v2 = myligand->volume [type_id2];
 
 					if (debug == 1)
-						printf(" %lf, electrostatic = %lf, desolv = %lf\n", (vdW1 - vdW2), tables.q1q2[atom_id1][atom_id2] * tables.r_epsr_table [distance_id],
-							   (s1*v2 + s2*v1) * tables.desolv_table [distance_id]);
+						printf(" %f, electrostatic = %f, desolv = %f\n", (vdW1 - vdW2), tables->q1q2[atom_id1][atom_id2] * tables->r_epsr_table [distance_id],
+							   (s1*v2 + s2*v1) * tables->desolv_table [distance_id]);
 
 					if ((a_flex + b_flex) & 1){ // if both atoms are of either a ligand or a flex res it's intra
-						interflexE += tables.q1q2[atom_id1][atom_id2] * tables.r_epsr_table [distance_id] +
-						              (s1*v2 + s2*v1) * tables.desolv_table [distance_id];
+						interflexE += tables->q1q2[atom_id1][atom_id2] * tables->r_epsr_table [distance_id] +
+						              (s1*v2 + s2*v1) * tables->desolv_table [distance_id];
 					} else{
-						el += tables.q1q2[atom_id1][atom_id2] * tables.r_epsr_table [distance_id];
-						desolv += (s1*v2 + s2*v1) * tables.desolv_table [distance_id];
+						el += tables->q1q2[atom_id1][atom_id2] * tables->r_epsr_table [distance_id];
+						desolv += (s1*v2 + s2*v1) * tables->desolv_table [distance_id];
 					}
 				}
 			}
@@ -2494,35 +2297,10 @@ float calc_intraE_f(
 	}
 
 	if (debug == 1)
-		printf("\nFinal energies: van der Waals = %lf, electrostatic = %lf, desolvation = %lf, total = %lf\n\n", vW, el, desolv, vW + el + desolv);
+		printf("\nFinal energies: van der Waals = %f, electrostatic = %f, desolvation = %f, total = %f\n\n", vW, el, desolv, vW + el + desolv);
 
 	if (!ignore_desolv)
 		return (vW + el + desolv);
 	else
 		return (vW + el);
 }
-
-int map_to_all_maps(
-                    Gridinfo*         mygrid,
-                    Liganddata*       myligand,
-                    std::vector<Map>& all_maps
-                   )
-{
-	for (int i_atom = 0; i_atom<myligand->num_of_atoms;i_atom++){
-		int type = myligand->atom_idxyzq[i_atom][0];
-		int type_idx = myligand->base_type_idx[type];
-		int map_idx = -1;
-		for (unsigned int i_map = 0; i_map<all_maps.size(); i_map++){
-			if (strcmp(all_maps[i_map].atype.c_str(),mygrid->grid_types[type_idx])==0){
-				map_idx = i_map;
-				break;
-			}
-		}
-		if (map_idx == -1) {printf("\nERROR: Did not map to all_maps correctly."); return 1;}
-
-		myligand->atom_map_to_fgrids[i_atom] = map_idx;
-//		printf("\nMapping atom %d (type %d, %s) in the ligand to map #%d (%s)",i_atom,type_idx,mygrid->grid_types[type_idx],map_idx,all_maps[map_idx].atype.c_str());
-	}
-
-	return 0;
-}
diff --git a/host/src/processresult.cpp b/host/src/processresult.cpp
index 409fe8a8..8f5e0385 100644
--- a/host/src/processresult.cpp
+++ b/host/src/processresult.cpp
@@ -24,6 +24,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 
 #include <stdio.h>
+#include <errno.h>
 #include "processresult.h"
 
 
@@ -59,7 +60,7 @@ void arrange_result(
 
 void write_basic_info(
                             FILE*       fp,
-                      const Liganddata* ligand_ref,
+                            Liganddata* ligand_ref,
                       const Dockpars*   mypars,
                       const Gridinfo*   mygrid,
                       const int*        argc,
@@ -122,17 +123,33 @@ void write_basic_info(
 	else
 		fprintf(fp, "LOAD FROM FILE (%s)\n",mypars->load_xml);
 
+#ifndef TOOLMODE
 	fprintf(fp, "\n\nProgram call in command line was:          ");
-	for (i=0; i<*argc; i++)
+	for (i=0; i<*argc; i++){
 		fprintf(fp, "%s ", argv [i]);
-	fprintf(fp, "\n\n\n");
+		if (argcmp("filelist", argv[i], 'B')){
+			if(mypars->filelist_files>1){
+				fprintf(fp, "%s ", mypars->ligandfile);
+				i+=mypars->filelist_files; // skip ahead in case there are multiple entries here
+			}
+		}
+		if (argcmp("xml2dlg", argv[i], 'X')){
+			if(mypars->xml_files>1){
+				fprintf(fp, "%s ", mypars->load_xml);
+				i+=mypars->xml_files; // skip ahead in case there are multiple entries here
+			}
+		}
+	}
+	fprintf(fp, "\n\n");
+#endif
+	fprintf(fp, "\n");
 
 	// Writing out receptor parameters
 
 	fprintf(fp, "        RECEPTOR PARAMETERS        \n");
 	fprintf(fp, "===================================\n\n");
 
-	fprintf(fp, "Receptor name:                             %s\n", mygrid->receptor_name);
+	fprintf(fp, "Receptor name:                             %s\n", mygrid->receptor_name.c_str());
 	fprintf(fp, "Number of grid points (x, y, z):           %d, %d, %d\n", mygrid->size_xyz [0], mygrid->size_xyz [1], mygrid->size_xyz [2]);
 	fprintf(fp, "Grid size (x, y, z):                       %lf, %lf, %lfA\n", mygrid->size_xyz_angstr [0], mygrid->size_xyz_angstr [1], mygrid->size_xyz_angstr [2]);
 	fprintf(fp, "Grid spacing:                              %lfA\n", mygrid->spacing);
@@ -172,7 +189,7 @@ void write_basic_info(
 
 void write_basic_info_dlg(
                                 FILE*       fp,
-                          const Liganddata* ligand_ref,
+                                Liganddata* ligand_ref,
                           const Dockpars*   mypars,
                           const Gridinfo*   mygrid,
                           const int*        argc,
@@ -237,19 +254,33 @@ void write_basic_info_dlg(
 
 	fprintf(fp, "RMSD tolerance:                            %lfA\n\n", mypars->rmsd_tolerance);
 
-	if(!mypars->xml2dlg){ // This is necessary to avoid excruciatingly long command line outputs with wild cards (like *.xml)
-		fprintf(fp, "Program call in command line was:          ");
-		for (i=0; i<*argc; i++)
-			fprintf(fp, "%s ", argv [i]);
-		fprintf(fp, "\n\n\n");
+#ifndef TOOLMODE
+	fprintf(fp, "Program call in command line was:          ");
+	for (i=0; i<*argc; i++){
+		fprintf(fp, "%s ", argv [i]);
+		if (argcmp("filelist", argv[i], 'B')){
+			if(mypars->filelist_files>1){
+				fprintf(fp, "%s ", mypars->ligandfile);
+				i+=mypars->filelist_files; // skip ahead in case there are multiple entries here
+			}
+		}
+		if (argcmp("xml2dlg", argv[i], 'X')){
+			if(mypars->xml_files>1){
+				fprintf(fp, "%s ", mypars->load_xml);
+				i+=mypars->xml_files; // skip ahead in case there are multiple entries here
+			}
+		}
 	}
+	fprintf(fp, "\n\n");
+#endif
+	fprintf(fp, "\n");
 
 	// Writing out receptor parameters
 
 	fprintf(fp, "    GRID PARAMETERS\n");
 	fprintf(fp, "    ________________________\n\n\n");
 
-	fprintf(fp, "Receptor name:                             %s\n", mygrid->receptor_name);
+	fprintf(fp, "Receptor name:                             %s\n", mygrid->receptor_name.c_str());
 	fprintf(fp, "Number of grid points (x, y, z):           %d, %d, %d\n", mygrid->size_xyz [0],
 			mygrid->size_xyz [1], mygrid->size_xyz [2]);
 	fprintf(fp, "Grid size (x, y, z):                       %lf, %lf, %lfA\n", mygrid->size_xyz_angstr [0],
@@ -288,7 +319,7 @@ void write_basic_info_dlg(
 		fprintf(fp, "    ________________________\n\n\n");
 		fprintf(fp, "DPF> outlev 1\n");
 		fprintf(fp, "DPF> ga_run %lu\n", mypars->num_of_runs);
-		fprintf(fp, "DPF> fld %s.maps.fld\n", mygrid->receptor_name);
+		fprintf(fp, "DPF> fld %s.maps.fld\n", mygrid->receptor_name.c_str());
 		fprintf(fp, "DPF> move %s\n", mypars->ligandfile);
 		if(flexres) fprintf(fp, "DPF> flexres %s\n", mypars->flexresfile);
 		fprintf(fp, "\n\n");
@@ -298,14 +329,14 @@ void write_basic_info_dlg(
 void make_resfiles(
                          float*        final_population,
                          float*        energies,
-                   const Liganddata*   ligand_ref,
-                   const Liganddata*   ligand_from_pdb,
+                         IntraTables*  tables,
+                         Liganddata*   ligand_ref,
+                         Liganddata*   ligand_from_pdb,
                    const Liganddata*   ligand_xray,
                    const Dockpars*     mypars,
                          int           evals_performed,
                          int           generations_used,
                    const Gridinfo*     mygrid,
-                   const float*        grids,
                    const int*          argc,
                          char**        argv,
                          int           debug,
@@ -317,7 +348,7 @@ void make_resfiles(
 // as well as different parameters about the docking, the receptor and the ligand to a file called fdock_report.txt in a
 // readable and understandable format. The ligand_from_pdb parametere must be the Liganddata which includes the original
 // ligand conformation as the result conformations will be compared to this one. The structs containing the grid informations
-// and docking parameters are requrided as well as the number and values of command line arguments. The ligand_ref parameter
+// and docking parameters are required as well as the number and values of command line arguments. The ligand_ref parameter
 // describes the ligand with the reference orientation (gene values of final_population refer to this one, that is, this can
 // be moved and rotated according to the genotype values). The function returns some information about the best result wich
 // was found with the best_result parameter.
@@ -325,11 +356,11 @@ void make_resfiles(
 	FILE* fp = stdout; // takes care of compile warning down below (and serves as a visual bug tracker in case fp is written to accidentally)
 	int i,j;
 	double entity_rmsds;
-	Liganddata temp_docked;
+	double init_atom_idxyzq[MAX_NUM_OF_ATOMS][5]; // type id .. 0, x .. 1, y .. 2, z .. 3, q ... 4
+	memcpy(init_atom_idxyzq, ligand_ref->atom_idxyzq, sizeof(ligand_ref->atom_idxyzq));
 	int len = strlen(mypars->ligandfile) - 6 + 24 + 10 + 10; // length with added bits for things below (numbers below 11 digits should be a safe enough threshold)
 	char* temp_filename = (char*)malloc((len+1)*sizeof(char)); // +\0 at the end
 	char* name_ext_start;
-	std::vector<AnalysisData> analysis;
 	float accurate_interE;
 	float accurate_intraflexE;
 	float accurate_intraE;
@@ -342,6 +373,10 @@ void make_resfiles(
 	if (mypars->gen_finalpop) // if final population files are not required, no file will be opened.
 	{
 		fp = fopen(temp_filename, "w");
+		if(fp==NULL){
+			printf("Error: Cannot create file %s for output of final population: %s\n",temp_filename,strerror(errno));
+			exit(5);
+		}
 
 		write_basic_info(fp, ligand_ref, mypars, mygrid, argc, argv); // Write basic information about docking and molecule parameters to file
 
@@ -369,10 +404,24 @@ void make_resfiles(
 	strcpy(temp_filename, mypars->ligandfile);
 	name_ext_start = temp_filename + strlen(mypars->ligandfile) - 6; // without .pdbqt
 
-	IntraTables tables(ligand_ref, mypars->coeffs.scaled_AD4_coeff_elec, mypars->coeffs.AD4_coeff_desolv, mypars->qasp);
+	bool rmsd_valid = true;
+	if (mypars->given_xrayligandfile == true) {
+		if(!((ligand_xray->num_of_atoms == ligand_ref->num_of_atoms) || (ligand_xray->num_of_atoms == ligand_ref->true_ligand_atoms))){
+			printf("Warning: RMSD can't be calculated, atom number mismatch %d (ref) vs. %d!\n",ligand_xray->true_ligand_atoms,ligand_ref->true_ligand_atoms);
+			rmsd_valid = false;
+		}
+	}
+	else {
+		if(ligand_from_pdb->true_ligand_atoms != ligand_ref->true_ligand_atoms){
+			printf("Warning: RMSD can't be calculated, atom number mismatch %d (ref) vs. %d!\n",ligand_from_pdb->true_ligand_atoms,ligand_ref->true_ligand_atoms); // no x-ray ligand here: report the pdb reference that was actually compared
+			rmsd_valid = false;
+		}
+	}
+
 	for (i=0; i<pop_size; i++)
 	{
-		temp_docked = *ligand_ref;
+		// start from original coordinates
+		memcpy(ligand_ref->atom_idxyzq, init_atom_idxyzq, sizeof(ligand_ref->atom_idxyzq));
 		
 		if(mypars->xml2dlg){
 			double axisangle[4];
@@ -384,39 +433,37 @@ void make_resfiles(
 			axisangle[1] = genotype[4];
 			axisangle[2] = genotype[5];
 			axisangle[3] = genotype[GENOTYPE_LENGTH_IN_GLOBMEM-1];
-			change_conform(&temp_docked, mygrid, genotype, axisangle, debug);
+			change_conform(ligand_ref, mygrid, genotype, axisangle, debug);
 		} else{
-			change_conform_f(&temp_docked, mygrid, final_population+i*GENOTYPE_LENGTH_IN_GLOBMEM, debug);
+			change_conform_f(ligand_ref, mygrid, final_population+i*GENOTYPE_LENGTH_IN_GLOBMEM, debug);
 		}
-		
 		// the map interaction of flex res atoms is stored in accurate_intraflexE
-		accurate_interE = calc_interE_f(mygrid, &temp_docked, grids, 0.0005, debug, accurate_intraflexE);	//calculating the intermolecular energy
+		if (i == 0)
+			accurate_interE = calc_interE_f(mygrid, ligand_ref, 0.0005, debug, accurate_intraflexE, &(best_result->interE_elec), best_result->peratom_vdw, best_result->peratom_elec); // calculate intermolecular and per atom energies
+		else
+			accurate_interE = calc_interE_f(mygrid, ligand_ref, 0.0005, debug, accurate_intraflexE); // calculating the intermolecular energy
 
 		if (mypars->contact_analysis && (i==0)){
-			analysis = analyze_ligand_receptor(mygrid, &temp_docked, mypars->receptor_atoms.data(), mypars->receptor_map, mypars->receptor_map_list, 0.0005, debug, mypars->H_cutoff, mypars->V_cutoff);
+			best_result->analysis = analyze_ligand_receptor(mygrid, ligand_ref, mypars->receptor_atoms.data(), mypars->receptor_map, mypars->receptor_map_list, 0.0005, debug, mypars->H_cutoff, mypars->V_cutoff);
 		}
 
-		if (i == 0) // additional calculations for ADT-compatible result file, only in case of best conformation
-			calc_interE_peratom_f(mygrid, &temp_docked, grids, 0.0005, &(best_result->interE_elec), best_result->peratom_vdw, best_result->peratom_elec, debug);
+		scale_ligand(ligand_ref, mygrid->spacing);
 
-		scale_ligand(&temp_docked, mygrid->spacing);
-		
 		// the interaction between flex res and ligand is stored in accurate_interflexE
 		if(mypars->contact_analysis && (i==0))
-			accurate_intraE = calc_intraE_f(&temp_docked, 8, mypars->smooth, 0, mypars->elec_min_distance, tables, debug, accurate_interflexE, mypars->nr_mod_atype_pairs, mypars->mod_atype_pairs, &analysis, mypars->receptor_atoms.data() + mypars->nr_receptor_atoms, mypars->R_cutoff, mypars->H_cutoff, mypars->V_cutoff);
+			accurate_intraE = calc_intraE_f(ligand_ref, 8, mypars->smooth, 0, mypars->elec_min_distance, tables, debug, accurate_interflexE, &(best_result->analysis), mypars->receptor_atoms.data() + mypars->nr_receptor_atoms, mypars->R_cutoff, mypars->H_cutoff, mypars->V_cutoff);
 		else
-			accurate_intraE = calc_intraE_f(&temp_docked, 8, mypars->smooth, 0, mypars->elec_min_distance, tables, debug, accurate_interflexE, mypars->nr_mod_atype_pairs, mypars->mod_atype_pairs);
+			accurate_intraE = calc_intraE_f(ligand_ref, 8, mypars->smooth, 0, mypars->elec_min_distance, tables, debug, accurate_interflexE);
 
-		move_ligand(&temp_docked, mygrid->origo_real_xyz, mygrid->origo_real_xyz); //moving it according to grid location
+		move_ligand(ligand_ref, mygrid->origo_real_xyz, mygrid->origo_real_xyz); //moving it according to grid location
 
-//		for (unsigned int atom_id=0; atom_id < temp_docked.num_of_atoms; atom_id++)
-//			printf("%i: %lf, %lf, %lf\n", atom_id+1, temp_docked.atom_idxyzq [atom_id][1], temp_docked.atom_idxyzq [atom_id][2], temp_docked.atom_idxyzq [atom_id][3]);
-
-		if (mypars->given_xrayligandfile == true) {
-			entity_rmsds = calc_rmsd(ligand_xray, &temp_docked, mypars->handle_symmetry); //calculating rmds compared to original xray file
-		}
-		else {
-			entity_rmsds = calc_rmsd(ligand_from_pdb, &temp_docked, mypars->handle_symmetry); //calculating rmds compared to original pdb file
+		if ((mypars->gen_finalpop) || (i==0)){ // rmsd value is only needed in either of those cases
+			if (rmsd_valid){
+				if (mypars->given_xrayligandfile)
+					entity_rmsds = calc_rmsd(ligand_xray->atom_idxyzq, ligand_ref->atom_idxyzq, ligand_xray->num_of_atoms, mypars->handle_symmetry); //calculating rmds compared to original xray file
+				else
+					entity_rmsds = calc_rmsd(ligand_from_pdb->atom_idxyzq, ligand_ref->atom_idxyzq, ligand_from_pdb->true_ligand_atoms, mypars->handle_symmetry); //calculating rmds compared to original pdb file
+			} else entity_rmsds = 100000;
 		}
 
 		// copying best result to output parameter
@@ -427,10 +474,16 @@ void make_resfiles(
 			best_result->interflexE = accurate_interflexE;
 			best_result->intraE = accurate_intraE;
 			best_result->intraflexE = accurate_intraflexE;
-			best_result->reslig_realcoord = temp_docked;
+			memcpy(best_result->atom_idxyzq, ligand_ref->atom_idxyzq, sizeof(ligand_ref->atom_idxyzq));
 			best_result->rmsd_from_ref = entity_rmsds;
 			best_result->run_number = run_cnt+1;
-			if(mypars->contact_analysis) best_result->analysis = analysis;
+			if(mypars->contact_analysis){
+				// sort by analysis type
+				for(unsigned int j=0; j<best_result->analysis.size(); j++)
+					for(unsigned int k=0; k<best_result->analysis.size()-j-1; k++)
+						if(best_result->analysis[k].type>best_result->analysis[k+1].type) // percolate larger types numbers up
+							std::swap(best_result->analysis[k], best_result->analysis[k+1]);
+			}
 		}
 
 		// generating best.pdbqt
@@ -440,13 +493,13 @@ void make_resfiles(
 				best_energy_of_all = accurate_interE + accurate_intraE;
 
 				if (mypars->gen_best)
-					gen_new_pdbfile(mypars->ligandfile, "best.pdbqt", &temp_docked);
+					gen_new_pdbfile(mypars->ligandfile, "best.pdbqt", ligand_ref);
 			}
 
 		if (i < mypars->gen_pdbs) //if it is necessary, making new pdbqts for best entities
 		{
 			sprintf(name_ext_start, "_docked_run%d_entity%d.pdbqt", run_cnt+1, i+1); //name will be <original pdb filename>_docked_<number starting from 1>.pdb
-			gen_new_pdbfile(mypars->ligandfile, temp_filename, &temp_docked);
+			gen_new_pdbfile(mypars->ligandfile, temp_filename, ligand_ref);
 		}
 		if (mypars->gen_finalpop)
 		{
@@ -464,158 +517,136 @@ void make_resfiles(
 			fprintf(fp, " %8.3lf | \n", entity_rmsds);
 		}
 	}
+	// need to restore ligand_ref to original coordinates before we leave
+	memcpy(ligand_ref->atom_idxyzq, init_atom_idxyzq, sizeof(ligand_ref->atom_idxyzq));
 	if (mypars->gen_finalpop) fclose(fp);
 	free(temp_filename);
 }
 
-void cluster_analysis(
-                            Ligandresult myresults [],
-                            int          num_of_runs,
-                            char*        report_file_name,
-                      const Liganddata*  ligand_ref,
-                      const Dockpars*    mypars,
-                      const Gridinfo*    mygrid,
-                      const int*         argc,
-                            char**       argv,
-                      const double       docking_avg_runtime,
-                      const double program_runtime
-                     )
-// The function performs ranked cluster analisys similar to that of AutoDock and creates a file with report_file_name name, the result
-// will be written to it.
+void ligand_calc_output(
+                              FILE*         fp,
+                        const char*         prefix,
+                              IntraTables*  tables,
+                        const Liganddata*   ligand,
+                        const Dockpars*     mypars,
+                        const Gridinfo*     mygrid,
+                              bool          output_analysis,
+                              bool          output_energy
+                       )
 {
-	int i,j;
-	Ligandresult temp_ligres;
-	int num_of_clusters;
-	int current_clust_center;
-	double temp_rmsd;
-	double cluster_tolerance = 2;
-	int result_clustered;
-	int subrank;
-	FILE* fp;
-	int cluster_sizes [1000];
-	double sum_energy [1000];
-	double best_energy [1000];
-
-	const double AD4_coeff_tors = mypars->coeffs.AD4_coeff_tors;
-	double torsional_energy;
-
-	// first of all, let's calculate the constant torsional free energy term
-	torsional_energy = AD4_coeff_tors * ligand_ref->true_ligand_rotbonds;
-
-	// arranging results according to energy, myresults [0] will be the best one (with lowest energy)
-	for (j=0; j<num_of_runs-1; j++)
-		for (i=num_of_runs-2; i>=j; i--) // arrange according to sum of inter- and intramolecular energies
-			if ((myresults [i]).interE /*+ (myresults [i]).intraE*/ > (myresults [i+1]).interE /*+ (myresults [i+1]).intraE*/) // mimics the behaviour of AD4 unbound_same_as_bound
-			//if ((myresults [i]).interE + (myresults [i]).intraE > (myresults [i+1]).interE + (myresults [i+1]).intraE)
-			{
-				temp_ligres = myresults [i];
-				myresults [i] = myresults [i+1];
-				myresults [i+1] = temp_ligres;
-			}
-
-	for (i=0; i<num_of_runs; i++)
-	{
-		(myresults [i]).clus_id = 0; // indicates that it hasn't been put into cluster yet
+	Liganddata calc_lig = *ligand;
+	Ligandresult calc;
+	double orig_vec[3];
+	for (unsigned int i=0; i<3; i++)
+		orig_vec [i] = -mygrid->origo_real_xyz [i];
+	move_ligand(&calc_lig, orig_vec, orig_vec); //moving it according to grid location
+	scale_ligand(&calc_lig, 1.0/mygrid->spacing);
+	calc.interE = calc_interE_f(mygrid, &calc_lig, 0.0005, 0, calc.intraflexE, &(calc.interE_elec), calc.peratom_vdw, calc.peratom_elec); // calculate intermolecular and per atom energies
+	if (output_analysis){
+		calc.analysis = analyze_ligand_receptor(mygrid, &calc_lig, mypars->receptor_atoms.data(), mypars->receptor_map, mypars->receptor_map_list, 0.0005, 0, mypars->H_cutoff, mypars->V_cutoff);
 	}
-
-	// the best result is the center of the first cluster
-	(myresults [0]).clus_id = 1;
-	(myresults [0]).rmsd_from_cluscent = 0;
-	num_of_clusters = 1;
-
-	for (i=1; i<num_of_runs; i++) // for each result
-	{
-		current_clust_center = 0;
-		result_clustered = 0;
-		for (j=0; j<i; j++) // results with lower id-s are clustered, look for cluster centers
-		{
-			if ((myresults [j]).clus_id > current_clust_center) // it is the center of a new cluster
-			{
-				current_clust_center = (myresults [j]).clus_id;
-				temp_rmsd = calc_rmsd(&((myresults [j]).reslig_realcoord), &((myresults [i]).reslig_realcoord), mypars->handle_symmetry); // comparing current result with cluster center
-				if (temp_rmsd <= cluster_tolerance) // in this case we put result i to cluster with center j
-				{
-					(myresults [i]).clus_id = current_clust_center;
-					(myresults [i]).rmsd_from_cluscent = temp_rmsd;
-					result_clustered = 1;
-					break;
+	scale_ligand(&calc_lig, mygrid->spacing);
+	// the interaction between flex res and ligand is stored in calc.interflexE
+	if(output_analysis)
+		calc.intraE = calc_intraE_f(&calc_lig, 8, mypars->smooth, 0, mypars->elec_min_distance, tables, 0, calc.interflexE, &(calc.analysis), mypars->receptor_atoms.data() + mypars->nr_receptor_atoms, mypars->R_cutoff, mypars->H_cutoff, mypars->V_cutoff);
+	else
+		calc.intraE = calc_intraE_f(&calc_lig, 8, mypars->smooth, 0, mypars->elec_min_distance, tables, 0, calc.interflexE);
+	move_ligand(&calc_lig, mygrid->origo_real_xyz, mygrid->origo_real_xyz); //moving it according to grid location
+	if (output_analysis){
+		// sort by analysis type
+		for(unsigned int j=0; j<calc.analysis.size(); j++)
+			for(unsigned int k=0; k<calc.analysis.size()-j-1; k++)
+				if(calc.analysis[k].type>calc.analysis[k+1].type) // percolate larger types numbers up
+					std::swap(calc.analysis[k], calc.analysis[k+1]);
+		if(calc.analysis.size()>0){
+			fprintf(fp, "ANALYSIS: COUNT %lu\n", calc.analysis.size());
+			std::string types    = "TYPE    {";
+			std::string lig_id   = "LIGID   {";
+			std::string ligname  = "LIGNAME {";
+			std::string rec_id   = "RECID   {";
+			std::string rec_name = "RECNAME {";
+			std::string residue  = "RESIDUE {";
+			std::string res_id   = "RESID   {";
+			std::string chain    = "CHAIN   {";
+			char item[8], pad[8];
+			for(unsigned int j=0; j<calc.analysis.size(); j++){
+				if(j>0){
+					types    += ",";
+					lig_id   += ",";
+					ligname  += ",";
+					rec_id   += ",";
+					rec_name += ",";
+					residue  += ",";
+					res_id   += ",";
+					chain    += ",";
+				}
+				switch(calc.analysis[j].type){
+					case 0: types += "   \"R\"";
+					        break;
+					case 1: types += "   \"H\"";
+					        break;
+					default:
+					case 2: types += "   \"V\"";
+					        break;
 				}
+				sprintf(item, "%5d ", calc.analysis[j].lig_id);   lig_id+=item;
+				sprintf(item, "\"%s\"", calc.analysis[j].lig_name); sprintf(pad, "%6s", item); ligname+=pad;
+				sprintf(item, "%5d ", calc.analysis[j].rec_id);   rec_id+=item;
+				sprintf(item, "\"%s\"", calc.analysis[j].rec_name); sprintf(pad, "%6s", item); rec_name+=pad;
+				sprintf(item, "\"%s\"", calc.analysis[j].residue); sprintf(pad, "%6s", item);  residue+=pad;
+				sprintf(item, "%5d ", calc.analysis[j].res_id);   res_id+=item;
+				sprintf(item, "\"%s\"", calc.analysis[j].chain); sprintf(pad, "%6s", item);    chain+=pad;
 			}
-		}
-		if (result_clustered != 1) // if no suitable cluster was found, this is the center of a new one
-		{
-			num_of_clusters++;
-			(myresults [i]).clus_id = num_of_clusters; // new cluster id
-			(myresults [i]).rmsd_from_cluscent = 0;
+			fprintf(fp, "ANALYSIS: %s}\n", types.c_str());
+			fprintf(fp, "ANALYSIS: %s}\n", lig_id.c_str());
+			fprintf(fp, "ANALYSIS: %s}\n", ligname.c_str());
+			fprintf(fp, "ANALYSIS: %s}\n", rec_id.c_str());
+			fprintf(fp, "ANALYSIS: %s}\n", rec_name.c_str());
+			fprintf(fp, "ANALYSIS: %s}\n", residue.c_str());
+			fprintf(fp, "ANALYSIS: %s}\n", res_id.c_str());
+			fprintf(fp, "ANALYSIS: %s}\n\n", chain.c_str());
 		}
 	}
-
-	for (i=1; i<=num_of_clusters; i++) // printing cluster info to file
-	{
-		subrank = 0;
-		cluster_sizes [i-1] = 0;
-		sum_energy [i-1] = 0;
-		for (j=0; j<num_of_runs; j++)
-			if (myresults [j].clus_id == i)
-			{
-				subrank++;
-				(cluster_sizes [i-1])++;
-				sum_energy [i-1] += (myresults [j]).interE + /*(myresults [j]).intraE +*/ torsional_energy; // intraE can be commented when unbound_same_as_bound
-				(myresults [j]).clus_subrank = subrank;
-				if (subrank == 1)
-					best_energy [i-1] = (myresults [j]).interE + /*(myresults [j]).intraE +*/ torsional_energy; // intraE can be commented when unbound_same_as_bound
-			}
-	}
-
-	fp = fopen(report_file_name, "w");
-
-	write_basic_info(fp, ligand_ref, mypars, mygrid, argc, argv); // Write basic information about docking and molecule parameters to file
-	fprintf(fp, "           RUN TIME INFO           \n");
-	fprintf(fp, "===================================\n\n");
-
-	fprintf(fp, "Average GPU run time for 1 run:           %lfs\n", docking_avg_runtime);
-	fprintf(fp, "Total GPU docking run time:               %fs\n", docking_avg_runtime*mypars->num_of_runs);
-
-	fprintf(fp, "Program run time:                          %lfs\n", program_runtime);
-	fprintf(fp, "\n\n");
-
-	fprintf(fp, "       CLUSTERING HISTOGRAM        \n");
-	fprintf(fp, "===================================\n\n");
-	fprintf(fp, " Cluster rank | Num in cluster |   Best energy   |   Mean energy   |    5    10   15   20   25   30   35\n");
-	fprintf(fp, "--------------+----------------+-----------------+-----------------+----+----+----+----+----+----+----+\n");
-
-	for (i=1; i<=num_of_clusters; i++)
-	{
-		fprintf(fp, "      %3d     |       %3d      | %15.3lf | %15.3lf |", i, cluster_sizes [i-1], best_energy [i-1], sum_energy [i-1]/cluster_sizes [i-1]);
-
-		for (j=0; j<cluster_sizes [i-1]; j++)
-			fprintf(fp, "#");
-
-		fprintf(fp, "\n");
+	if(output_energy){
+		double torsional_energy = mypars->coeffs.AD4_coeff_tors * calc_lig.true_ligand_rotbonds;
+		fprintf(fp, "%s    Estimated Free Energy of Binding    =", prefix);
+		PRINT1000(fp, ((float) (calc.interE + calc.interflexE + torsional_energy)));
+		fprintf(fp, " kcal/mol  [=(1)+(2)+(3)-(4)]\n");
+		fprintf(fp, "%s\n", prefix);
+		fprintf(fp, "%s    (1) Final Intermolecular Energy     =", prefix);
+		PRINT1000(fp, ((float) (calc.interE + calc.interflexE)));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s        vdW + Hbond + desolv Energy     =", prefix);
+		PRINT1000(fp, ((float) (calc.interE - calc.interE_elec)));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s        Electrostatic Energy            =", prefix);
+		PRINT1000(fp, ((float) calc.interE_elec));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s        Moving Ligand-Fixed Receptor    =", prefix);
+		PRINT1000(fp, ((float) calc.interE));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s        Moving Ligand-Moving Receptor   =", prefix);
+		PRINT1000(fp, ((float) calc.interflexE));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s    (2) Final Total Internal Energy     =", prefix);
+		PRINT1000(fp, ((float) (calc.intraE + calc.intraflexE)));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s    (3) Torsional Free Energy           =", prefix);
+		PRINT1000(fp, ((float) torsional_energy));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s    (4) Unbound System's Energy         =", prefix);
+		PRINT1000(fp, ((float) (calc.intraE + calc.intraflexE)));
+		fprintf(fp, " kcal/mol\n");
+		fprintf(fp, "%s\n", prefix);
 	}
-	fprintf(fp, "\n\n");
-
-	fprintf(fp, "              CLUSTERS             \n");
-	fprintf(fp, "===================================\n\n");
-	fprintf(fp, " Rank | Subrank | Run | Intermolecular E | Intramolecular E | Torsional energy |   Total energy   | Cluster RMSD | Reference RMSD |\n");
-	fprintf(fp, "------+---------+-----+------------------+------------------+------------------+------------------+--------------+----------------+\n");
-
-	for (i=1; i<=num_of_clusters; i++) // printing cluster info to file
-	{
-		for (j=0; j<num_of_runs; j++)
-			if (myresults [j].clus_id == i)
-			{
-				fprintf(fp, "  %3d |   %3d   | %3d |  %15.3lf |  %15.3lf |  %15.3lf |  %15.3lf |     %4.2lf     |      %4.2lf      |\n", (myresults [j]).clus_id, (myresults [j]).clus_subrank, (myresults [j]).run_number,
-				            (myresults [j]).interE, (myresults [j]).intraE, torsional_energy, (myresults [j]).interE + /*(myresults [j]).intraE +*/ torsional_energy, (myresults [j]).rmsd_from_cluscent, (myresults [j]).rmsd_from_ref); // intraE can be commented when unbound_same_as_bound
-			}
-	}
-	fclose(fp);
 }
 
-void clusanal_gendlg(
+void generate_output(
                            Ligandresult  myresults [],
                            int           num_of_runs,
-                     const Liganddata*   ligand_ref,
+                           IntraTables*  tables,
+                           Liganddata*   ligand_ref,
+                     const Liganddata*   ligand_xray,
                      const Dockpars*     mypars,
                      const Gridinfo*     mygrid,
                      const int*          argc,
@@ -630,14 +661,12 @@ void clusanal_gendlg(
 // will be written to it.
 {
 	int i, j, atom_cnt;
-	Ligandresult temp_ligres;
-	int num_of_clusters;
+	int num_of_clusters = 0;
 	int current_clust_center;
 	double temp_rmsd;
 	int result_clustered;
 	int subrank;
 	FILE* fp = stdout;
-	FILE* fp_orig;
 	FILE* fp_xml;
 	int cluster_sizes [1000];
 	double sum_energy [1000];
@@ -646,13 +675,12 @@ void clusanal_gendlg(
 	char tempstr [256];
 
 	double cluster_tolerance = mypars->rmsd_tolerance;
-	const double AD4_coeff_tors = mypars->coeffs.AD4_coeff_tors;
-	double torsional_energy;
 
 	// first of all, let's calculate the constant torsional free energy term
-	torsional_energy = AD4_coeff_tors * ligand_ref->true_ligand_rotbonds;
+	double torsional_energy = mypars->coeffs.AD4_coeff_tors * ligand_ref->true_ligand_rotbonds;
 
 	int len = strlen(mypars->resname) + 4 + 1;
+	
 	// GENERATING DLG FILE
 	if(mypars->output_dlg){
 		if(!mypars->dlg2stdout){
@@ -660,6 +688,10 @@ void clusanal_gendlg(
 			strcpy(report_file_name, mypars->resname);
 			strcat(report_file_name, ".dlg");
 			fp = fopen(report_file_name, "w");
+			if(fp==NULL){
+				printf("Error: Cannot create dlg output file %s: %s\n",report_file_name,strerror(errno));
+				exit(7);
+			}
 			free(report_file_name);
 		}
 
@@ -667,8 +699,8 @@ void clusanal_gendlg(
 		write_basic_info_dlg(fp, ligand_ref, mypars, mygrid, argc, argv);
 
 		if(!mypars->xml2dlg){
-			fprintf(fp, "           COUNTER STATES           \n");
-			fprintf(fp, "___________________________________\n\n");
+			fprintf(fp, "    COUNTER STATES\n");
+			fprintf(fp, "    ________________________\n\n\n");
 			fprintf(fp, "Number of energy evaluations performed:    %lu\n", evals_performed);
 			fprintf(fp, "Number of generations used:                %lu\n", generations_used);
 			fprintf(fp, "\n\n");
@@ -677,11 +709,30 @@ void clusanal_gendlg(
 		std::string pdbqt_template;
 		std::vector<unsigned int> atom_data;
 		char lineout [264];
+		bool output_ref_calcs = mypars->reflig_en_required;
+		if(mypars->given_xrayligandfile){
+			// writing xray ligand pdbqt file
+			fprintf(fp, "    XRAY LIGAND PDBQT FILE:\n");
+			fprintf(fp, "    ________________________\n\n\n");
+			ligand_calc_output(fp, "XRAY-LIGAND-PDBQT: USER", tables, ligand_xray, mypars, mygrid, mypars->contact_analysis, output_ref_calcs);
+			if(output_ref_calcs) output_ref_calcs=false;
+			unsigned int line_count = 0;
+			while (line_count < ligand_xray->ligand_line_count)
+			{
+				snprintf(tempstr, sizeof(tempstr), "%s", ligand_xray->file_content[line_count].c_str()); // bounded copy: tempstr is a fixed-size buffer, over-long lines are truncated instead of overflowing
+				line_count++;
+				fprintf(fp, "XRAY-LIGAND-PDBQT: %s", tempstr);
+			}
+			fprintf(fp, "\n\n");
+		}
 		// writing input pdbqt file
 		fprintf(fp, "    INPUT LIGAND PDBQT FILE:\n    ________________________\n\n\n");
-		fp_orig = fopen(mypars->ligandfile, "rb"); // fp_orig = fopen(mypars->ligandfile, "r");
-		while (fgets(tempstr, 255, fp_orig) != NULL) // reading original ligand pdb line by line
+		ligand_calc_output(fp, "INPUT-LIGAND-PDBQT: USER", tables, ligand_ref, mypars, mygrid, mypars->contact_analysis, output_ref_calcs);
+		unsigned int line_count = 0;
+		while (line_count < ligand_ref->ligand_line_count)
 		{
+			snprintf(tempstr, sizeof(tempstr), "%s", ligand_ref->file_content[line_count].c_str()); // bounded copy: tempstr is a fixed-size buffer, over-long lines are truncated instead of overflowing
+			line_count++;
 			fprintf(fp, "INPUT-LIGAND-PDBQT: %s", tempstr);
 			if ((strncmp("ATOM", tempstr, 4) == 0) || (strncmp("HETATM", tempstr, 6) == 0))
 			{
@@ -700,15 +751,15 @@ void clusanal_gendlg(
 			}
 		}
 		fprintf(fp, "\n\n");
-		fclose(fp_orig);
 		
 		// writing input flexres pdbqt file if specified
 		if (mypars->flexresfile!=NULL) {
 			if ( strlen(mypars->flexresfile)>0 ) {
 				fprintf(fp, "    INPUT FLEXRES PDBQT FILE:\n    ________________________\n\n\n");
-				fp_orig = fopen(mypars->flexresfile, "rb"); // fp_orig = fopen(mypars->flexresfile, "r");
-				while (fgets(tempstr, 255, fp_orig) != NULL) // reading original flexres pdb line by line
+				while (line_count < ligand_ref->file_content.size())
 				{
+					snprintf(tempstr, sizeof(tempstr), "%s", ligand_ref->file_content[line_count].c_str()); // bounded copy: tempstr is a fixed-size buffer, over-long lines are truncated instead of overflowing
+					line_count++;
 					fprintf(fp, "INPUT-FLEXRES-PDBQT: %s", tempstr);
 					if ((strncmp("ATOM", tempstr, 4) == 0) || (strncmp("HETATM", tempstr, 6) == 0))
 					{
@@ -727,7 +778,6 @@ void clusanal_gendlg(
 					}
 				}
 				fprintf(fp, "\n\n");
-				fclose(fp_orig);
 			}
 		}
 		
@@ -742,15 +792,6 @@ void clusanal_gendlg(
 
 			if(mypars->contact_analysis){
 				if(myresults[i].analysis.size()>0){
-					// sort by analysis type
-					AnalysisData temp;
-					for(unsigned int j=0; j<myresults[i].analysis.size(); j++)
-						for(unsigned int k=0; k<myresults[i].analysis.size()-j-1; k++)
-							if(myresults[i].analysis[k].type>myresults[i].analysis[k+1].type){
-								temp = myresults[i].analysis[k];
-								myresults[i].analysis[k]   = myresults[i].analysis[k+1];
-								myresults[i].analysis[k+1] = temp;
-							}
 					fprintf(fp, "ANALYSIS: COUNT %lu\n", myresults[i].analysis.size());
 					std::string types    = "TYPE    {";
 					std::string lig_id   = "LIGID   {";
@@ -852,7 +893,7 @@ void clusanal_gendlg(
 					fprintf(fp, "DOCKED: USER    NEWDPF axisangle0 %.8f %.8f %.8f %.6f\n", sin(theta)*cos(phi), sin(theta)*sin(phi), cos(theta), myresults[i].genotype[5]);
 				} else fprintf(fp, "DOCKED: USER    NEWDPF axisangle0 %.8f %.8f %.8f %.6f\n", myresults[i].genotype[3], myresults[i].genotype[4], myresults[i].genotype[5], myresults[i].genotype[GENOTYPE_LENGTH_IN_GLOBMEM-1]);
 				fprintf(fp, "DOCKED: USER    NEWDPF dihe0");
-				for(j=0; j<myresults[i].reslig_realcoord.num_of_rotbonds; j++)
+				for(j=0; j<ligand_ref->num_of_rotbonds; j++)
 					fprintf(fp, " %.6f", myresults[i].genotype[6+j]);
 				fprintf(fp, "\n");
 			}
@@ -865,16 +906,18 @@ void clusanal_gendlg(
 			}
 			
 			curr_model = pdbqt_template;
-			for(atom_cnt = atom_data.size(); atom_cnt-->0;)
+			// inserting text from the end means prior text positions won't shift
+			// so there's less to keep track of ;-)
+			for(atom_cnt = ligand_ref->num_of_atoms; atom_cnt-->0;)
 			{
 				char* line = lineout;
-				line += sprintf(line, "%8.3lf", myresults[i].reslig_realcoord.atom_idxyzq[atom_cnt][1]); // x
-				line += sprintf(line, "%8.3lf", myresults[i].reslig_realcoord.atom_idxyzq[atom_cnt][2]); // y
-				line += sprintf(line, "%8.3lf", myresults[i].reslig_realcoord.atom_idxyzq[atom_cnt][3]); // z
+				line += sprintf(line, "%8.3lf", myresults[i].atom_idxyzq[atom_cnt][1]); // x
+				line += sprintf(line, "%8.3lf", myresults[i].atom_idxyzq[atom_cnt][2]); // y
+				line += sprintf(line, "%8.3lf", myresults[i].atom_idxyzq[atom_cnt][3]); // z
 				line += sprintf(line, "%+6.2lf", copysign(fmin(fabs(myresults[i].peratom_vdw[atom_cnt]),99.99),myresults[i].peratom_vdw[atom_cnt])); // vdw
 				line += sprintf(line, "%+6.2lf", copysign(fmin(fabs(myresults[i].peratom_elec[atom_cnt]),99.99),myresults[i].peratom_elec[atom_cnt])); // elec
-				line += sprintf(line, "    %+6.3lf ", myresults[i].reslig_realcoord.atom_idxyzq[atom_cnt][4]); // q
-				line += sprintf(line, "%-2s\n", myresults[i].reslig_realcoord.atom_types[((int) myresults[i].reslig_realcoord.atom_idxyzq[atom_cnt][0])]); // type
+				line += sprintf(line, "    %+6.3lf ", myresults[i].atom_idxyzq[atom_cnt][4]); // q
+				line += sprintf(line, "%-2s\n", ligand_ref->atom_types[((int)myresults[i].atom_idxyzq[atom_cnt][0])]); // type
 				curr_model.insert(atom_data[atom_cnt],lineout);
 			}
 			fprintf(fp, "%s", curr_model.c_str());
@@ -883,150 +926,158 @@ void clusanal_gendlg(
 			fprintf(fp, "________________________________________________________________________________\n\n\n");
 		}
 	}
-	// PERFORM CLUSTERING
-	// arranging results according to energy, myresults [0] will be the best one (with lowest energy)
-	for (j=0; j<num_of_runs-1; j++)
-		for (i=num_of_runs-2; i>=j; i--) // arrange according to sum of inter- and intramolecular energies
-			if ((myresults [i]).interE+myresults[i].interflexE /*+ (myresults [i]).intraE*/ > (myresults [i+1]).interE+myresults[i+1].interflexE /*+ (myresults [i+1]).intraE*/)	//mimics the behaviour of AD4 unbound_same_as_bound
-			//if ((myresults [i]).interE + (myresults [i]).intraE > (myresults [i+1]).interE + (myresults [i+1]).intraE)
-			{
-				temp_ligres = myresults [i];
-				myresults [i] = myresults [i+1];
-				myresults [i+1] = temp_ligres;
-			}
-
-	for (i=0; i<num_of_runs; i++)
-	{
-		(myresults [i]).clus_id = 0; // indicates that it hasn't been put into cluster yet
+	
+	// arranging results according to energy, myresults [energy_order[0]] will be the best one (with lowest energy)
+	std::vector<int> energy_order(num_of_runs);
+	std::vector<double> energies(num_of_runs);
+	for (i=0; i<num_of_runs; i++){
+		energy_order[i] = i;
+		energies[i] = myresults [i].interE+myresults[i].interflexE; // mimics the behaviour of AD4 unbound_same_as_bound
+		myresults[i].clus_id = 0; // indicates that it hasn't been put into cluster yet (may as well do that here ...)
 	}
+	// sorting the indices instead of copying the results around will be faster
+	for(i=0; i<num_of_runs-1; i++)
+		for(j=0; j<num_of_runs-i-1; j++)
+			if(energies[energy_order[j]]>energies[energy_order[j+1]]) // swap indices to percolate larger energies up
+				std::swap(energy_order[j], energy_order[j+1]);
+	// PERFORM CLUSTERING
+	if(mypars->calc_clustering){
 
-	// the best result is the center of the first cluster
-	(myresults [0]).clus_id = 1;
-	(myresults [0]).rmsd_from_cluscent = 0;
-	num_of_clusters = 1;
+		// the best result is the center of the first cluster
+		myresults[energy_order[0]].clus_id = 1;
+		myresults[energy_order[0]].rmsd_from_cluscent = 0;
+		num_of_clusters = 1;
 
-	for (i=1; i<num_of_runs; i++) // for each result
-	{
-		current_clust_center = 0;
-		result_clustered = 0;
-
-		for (j=0; j<i; j++) // results with lower id-s are clustered, look for cluster centers
+		for (int w=1; w<num_of_runs; w++) // for each result
 		{
-			if ((myresults [j]).clus_id > current_clust_center) // it is the center of a new cluster
+			i=energy_order[w];
+			current_clust_center = 0;
+			result_clustered = 0;
+
+			for (int u=0; u<w; u++) // results with lower id-s are clustered, look for cluster centers
 			{
-				current_clust_center = (myresults [j]).clus_id;
-				temp_rmsd = calc_rmsd(&((myresults [j]).reslig_realcoord), &((myresults [i]).reslig_realcoord), mypars->handle_symmetry); // comparing current result with cluster center
-				if (temp_rmsd <= cluster_tolerance) // in this case we put result i to cluster with center j
+				j=energy_order[u];
+				if (myresults[j].clus_id > current_clust_center) // it is the center of a new cluster
 				{
-					(myresults [i]).clus_id = current_clust_center;
-					(myresults [i]).rmsd_from_cluscent = temp_rmsd;
-					result_clustered = 1;
-					break;
+					current_clust_center = myresults[j].clus_id;
+					temp_rmsd = calc_rmsd(myresults[j].atom_idxyzq, myresults[i].atom_idxyzq, ligand_ref->true_ligand_atoms, mypars->handle_symmetry); // comparing current result with cluster center
+					if (temp_rmsd <= cluster_tolerance) // in this case we put result i to cluster with center j
+					{
+						myresults[i].clus_id = current_clust_center;
+						myresults[i].rmsd_from_cluscent = temp_rmsd;
+						result_clustered = 1;
+						break;
+					}
 				}
 			}
-		}
 
-		if (result_clustered != 1) // if no suitable cluster was found, this is the center of a new one
-		{
-			num_of_clusters++;
-			(myresults [i]).clus_id = num_of_clusters; // new cluster id
-			(myresults [i]).rmsd_from_cluscent = 0;
+			if (result_clustered != 1) // if no suitable cluster was found, this is the center of a new one
+			{
+				num_of_clusters++;
+				myresults[i].clus_id = num_of_clusters; // new cluster id
+				myresults[i].rmsd_from_cluscent = 0;
+			}
 		}
-	}
 
-	for (i=1; i<=num_of_clusters; i++) // printing cluster info to file
-	{
-		subrank = 0;
-		cluster_sizes [i-1] = 0;
-		sum_energy [i-1] = 0;
-		for (j=0; j<num_of_runs; j++)
-			if (myresults [j].clus_id == i)
-			{
-				subrank++;
-				(cluster_sizes [i-1])++;
-				sum_energy [i-1] += (myresults [j]).interE + myresults[j].interflexE + /*(myresults [j]).intraE +*/ torsional_energy; // intraE can be commented when unbound_same_as_bound
-				(myresults [j]).clus_subrank = subrank;
-				if (subrank == 1)
+		for (i=1; i<=num_of_clusters; i++) // printing cluster info to file
+		{
+			subrank = 0;
+			cluster_sizes [i-1] = 0;
+			sum_energy [i-1] = 0;
+			for (int u=0; u<num_of_runs; u++){
+				j = energy_order[u];
+				if (myresults [j].clus_id == i)
 				{
-					best_energy [i-1] = (myresults [j]).interE + myresults[j].interflexE + /*(myresults [j]).intraE +*/ torsional_energy; // intraE can be commented when unbound_same_as_bound
-					best_energy_runid  [i-1] = (myresults [j]).run_number;
+					subrank++;
+					cluster_sizes[i-1]++;
+					sum_energy [i-1] += myresults[j].interE + myresults[j].interflexE + /*(myresults [j]).intraE +*/ torsional_energy; // intraE can be commented when unbound_same_as_bound
+					myresults[j].clus_subrank = subrank;
+					if (subrank == 1)
+					{
+						best_energy [i-1] = myresults[j].interE + myresults[j].interflexE + /*(myresults [j]).intraE +*/ torsional_energy; // intraE can be commented when unbound_same_as_bound
+						best_energy_runid  [i-1] = myresults[j].run_number;
+					}
 				}
 			}
-	}
+		}
 
-	if(mypars->output_dlg){
-		// WRITING CLUSTER INFORMATION
-		fprintf(fp, "    CLUSTERING HISTOGRAM\n    ____________________\n\n\n");
-		fprintf(fp, "________________________________________________________________________________\n");
-		fprintf(fp, "     |           |     |           |     |\n");
-		fprintf(fp, "Clus | Lowest    | Run | Mean      | Num | Histogram\n");
-		fprintf(fp, "-ter | Binding   |     | Binding   | in  |\n");
-		fprintf(fp, "Rank | Energy    |     | Energy    | Clus|    5    10   15   20   25   30   35\n");
-		fprintf(fp, "_____|___________|_____|___________|_____|____:____|____:____|____:____|____:___\n");
-
-		for (i=0; i<num_of_clusters; i++)
-		{
-			fprintf(fp, "%4d |", i+1);
+		if(mypars->output_dlg){
+			// WRITING CLUSTER INFORMATION
+			fprintf(fp, "    CLUSTERING HISTOGRAM\n    ____________________\n\n\n");
+			fprintf(fp, "________________________________________________________________________________\n");
+			fprintf(fp, "     |           |     |           |     |\n");
+			fprintf(fp, "Clus | Lowest    | Run | Mean      | Num | Histogram\n");
+			fprintf(fp, "-ter | Binding   |     | Binding   | in  |\n");
+			fprintf(fp, "Rank | Energy    |     | Energy    | Clus|    5    10   15   20   25   30   35\n");
+			fprintf(fp, "_____|___________|_____|___________|_____|____:____|____:____|____:____|____:___\n");
+
+			for (i=0; i<num_of_clusters; i++)
+			{
+				fprintf(fp, "%4d |", i+1);
 
-			if (best_energy[i] > 999999.99)
-				fprintf(fp, "%+10.2e", best_energy[i]);
-			else
-				fprintf(fp, "%+10.2f", best_energy[i]);
-			fprintf(fp, " |%4d |", best_energy_runid[i]);
+				if (best_energy[i] > 999999.99)
+					fprintf(fp, "%+10.2e", best_energy[i]);
+				else
+					fprintf(fp, "%+10.2f", best_energy[i]);
+				fprintf(fp, " |%4d |", best_energy_runid[i]);
 
-			if (sum_energy[i]/cluster_sizes[i] > 999999.99)
-				fprintf(fp, "%+10.2e |", sum_energy[i]/cluster_sizes[i]);
-			else
-				fprintf(fp, "%+10.2f |", sum_energy[i]/cluster_sizes[i]);
+				if (sum_energy[i]/cluster_sizes[i] > 999999.99)
+					fprintf(fp, "%+10.2e |", sum_energy[i]/cluster_sizes[i]);
+				else
+					fprintf(fp, "%+10.2f |", sum_energy[i]/cluster_sizes[i]);
 
-			fprintf(fp, "%4d |", cluster_sizes [i]);
+				fprintf(fp, "%4d |", cluster_sizes [i]);
 
-			for (j=0; j<cluster_sizes [i]; j++)
-				fprintf(fp, "#");
+				for (j=0; j<cluster_sizes [i]; j++)
+					fprintf(fp, "#");
 
-			fprintf(fp, "\n");
-		}
+				fprintf(fp, "\n");
+			}
 
-		fprintf(fp, "_____|___________|_____|___________|_____|______________________________________\n\n\n");
+			fprintf(fp, "_____|___________|_____|___________|_____|______________________________________\n\n\n");
 
-		// writing RMSD table
+			// writing RMSD table
 
-		fprintf(fp, "    RMSD TABLE\n");
-		fprintf(fp, "    __________\n\n\n");
+			fprintf(fp, "    RMSD TABLE\n");
+			fprintf(fp, "    __________\n\n\n");
 
-		fprintf(fp, "_____________________________________________________________________\n");
-		fprintf(fp, "     |      |      |           |         |                 |\n");
-		fprintf(fp, "Rank | Sub- | Run  | Binding   | Cluster | Reference       | Grep\n");
-		fprintf(fp, "     | Rank |      | Energy    | RMSD    | RMSD            | Pattern\n");
-		fprintf(fp, "_____|______|______|___________|_________|_________________|___________\n" );
+			fprintf(fp, "_____________________________________________________________________\n");
+			fprintf(fp, "     |      |      |           |         |                 |\n");
+			fprintf(fp, "Rank | Sub- | Run  | Binding   | Cluster | Reference       | Grep\n");
+			fprintf(fp, "     | Rank |      | Energy    | RMSD    | RMSD            | Pattern\n");
+			fprintf(fp, "_____|______|______|___________|_________|_________________|___________\n" );
 
-		for (i=0; i<num_of_clusters; i++) // printing cluster info to file
-		{
-			for (j=0; j<num_of_runs; j++)
-				if (myresults [j].clus_id == i+1) {
-					if (myresults[j].interE + myresults[j].interflexE + torsional_energy > 999999.99)
-						fprintf(fp, "%4d   %4d   %4d  %+10.2e  %8.2f  %8.2f           RANKING\n",
-						             (myresults [j]).clus_id,
-						                   (myresults [j]).clus_subrank,
-						                         (myresults [j]).run_number,
-						                              myresults[j].interE + myresults[j].interflexE + torsional_energy,
-						                                       (myresults [j]).rmsd_from_cluscent,
-						                                              (myresults [j]).rmsd_from_ref);
-					else
-						fprintf(fp, "%4d   %4d   %4d  %+10.2f  %8.2f  %8.2f           RANKING\n",
-						             (myresults [j]).clus_id,
-						                   (myresults [j]).clus_subrank,
-						                         (myresults [j]).run_number,
-						                              myresults[j].interE + myresults[j].interflexE + torsional_energy,
-						                                       (myresults [j]).rmsd_from_cluscent,
-						                                              (myresults [j]).rmsd_from_ref);
+			for (i=0; i<num_of_clusters; i++) // printing cluster info to file
+			{
+				for (int u=0; u<num_of_runs; u++){
+					j = energy_order[u];
+					if (myresults[j].clus_id == i+1) {
+						if (myresults[j].interE + myresults[j].interflexE + torsional_energy > 999999.99)
+							fprintf(fp, "%4d   %4d   %4d  %+10.2e  %8.2f  %8.2f           RANKING\n",
+							             myresults[j].clus_id,
+							                   myresults[j].clus_subrank,
+							                         myresults[j].run_number,
+							                              myresults[j].interE + myresults[j].interflexE + torsional_energy,
+							                                       myresults[j].rmsd_from_cluscent,
+							                                              myresults[j].rmsd_from_ref);
+						else
+							fprintf(fp, "%4d   %4d   %4d  %+10.2f  %8.2f  %8.2f           RANKING\n",
+							             myresults[j].clus_id,
+							                   myresults[j].clus_subrank,
+							                         myresults[j].run_number,
+							                              myresults[j].interE + myresults[j].interflexE + torsional_energy,
+							                                       myresults[j].rmsd_from_cluscent,
+							                                              myresults[j].rmsd_from_ref);
+					}
 				}
+			}
 		}
-
+	}
+	
+	if(mypars->output_dlg){
 		// Add execution and idle time information
 		fprintf(fp, "\nRun time %.3f sec", exec_time);
 		fprintf(fp, "\nIdle time %.3f sec\n", idle_time);
-
 		if(!mypars->dlg2stdout){
 			fclose(fp);
 		}
@@ -1039,14 +1090,31 @@ void clusanal_gendlg(
 		strcpy(xml_file_name, mypars->resname);
 		strcat(xml_file_name, ".xml");
 		fp_xml = fopen(xml_file_name, "w");
+		if(fp_xml==NULL){ // test the xml handle opened just above (not the dlg handle fp, which may be closed or unused here)
+			printf("Error: Cannot create xml output file %s: %s\n",xml_file_name,strerror(errno));
+			exit(9);
+		}
 
 		fprintf(fp_xml, "<?xml version=\"1.0\" ?>\n");
 		fprintf(fp_xml, "<autodock_gpu>\n");
 		fprintf(fp_xml, "\t<version>%s</version>\n",VERSION);
 		if((*argc)>1){
 			fprintf(fp_xml, "\t<arguments>");
-			for(i=1; i<(*argc); i++)
+			for(i=1; i<(*argc); i++){
 				fprintf(fp_xml, "%s%s", (i>1)?" ":"", argv[i]);
+				if (argcmp("filelist", argv[i], 'B')){
+					if(mypars->filelist_files>1){
+						fprintf(fp_xml, " %s", mypars->ligandfile);
+						i+=mypars->filelist_files; // skip ahead in case there are multiple entries here
+					}
+				}
+				if (argcmp("xml2dlg", argv[i], 'X')){
+					if(mypars->xml_files>1){
+						fprintf(fp_xml, " %s", mypars->load_xml);
+						i+=mypars->xml_files; // skip ahead in case there are multiple entries here
+					}
+				}
+			}
 			fprintf(fp_xml, "</arguments>\n");
 		}
 		if(mypars->dpffile)
@@ -1070,19 +1138,11 @@ void clusanal_gendlg(
 		fprintf(fp_xml, "\t<run_requested>%lu</run_requested>\n",mypars->num_of_runs);
 		fprintf(fp_xml, "\t<runs>\n");
 		double phi, theta;
-		for(j=0; j<num_of_runs; j++){
+		for(int u=0; u<num_of_runs; u++){
+			j = energy_order[u];
 			fprintf(fp_xml, "\t\t<run id=\"%d\">\n",(myresults [j]).run_number);
 			if(mypars->contact_analysis){
 				if(myresults[j].analysis.size()>0){
-					// sort by analysis type
-					AnalysisData temp;
-					for(unsigned int i=0; i<myresults[j].analysis.size(); i++)
-						for(unsigned int k=0; k<myresults[j].analysis.size()-i-1; k++)
-							if(myresults[j].analysis[k].type>myresults[j].analysis[k+1].type){
-								temp = myresults[j].analysis[k];
-								myresults[j].analysis[k]   = myresults[j].analysis[k+1];
-								myresults[j].analysis[k+1] = temp;
-							}
 					fprintf(fp_xml, "\t\t\t<contact_analysis count=\"%lu\">\n", myresults[j].analysis.size());
 					std::string types;
 					std::string lig_id;
@@ -1140,36 +1200,39 @@ void clusanal_gendlg(
 			phi = myresults[j].genotype[3]/180.0*PI;
 			theta = myresults[j].genotype[4]/180.0*PI;
 			fprintf(fp_xml, "\t\t\t<axisangle0>%.8f %.8f %.8f %.6f</axisangle0>\n", sin(theta)*cos(phi), sin(theta)*sin(phi), cos(theta), myresults[j].genotype[5]);
-			fprintf(fp_xml, "\t\t\t<ndihe>%d</ndihe>\n", myresults[j].reslig_realcoord.num_of_rotbonds);
+			fprintf(fp_xml, "\t\t\t<ndihe>%d</ndihe>\n", ligand_ref->num_of_rotbonds);
 			fprintf(fp_xml, "\t\t\t<dihe0>");
-			for(i=0; i<myresults[j].reslig_realcoord.num_of_rotbonds; i++)
+			for(i=0; i<ligand_ref->num_of_rotbonds; i++)
 				fprintf(fp_xml, "%s%.6f", (i>0)?" ":"", myresults[j].genotype[6+i]);
 			fprintf(fp_xml, "\n\t\t\t</dihe0>\n");
 			fprintf(fp_xml, "\t\t</run>\n");
 		}
 		fprintf(fp_xml, "\t</runs>\n");
-		fprintf(fp_xml, "\t<result>\n");
-		
-		fprintf(fp_xml, "\t\t<clustering_histogram>\n");
-		for (i=0; i<num_of_clusters; i++)
-		{
-			fprintf(fp_xml, "\t\t\t<cluster cluster_rank=\"%d\" lowest_binding_energy=\"%.2lf\" run=\"%d\" mean_binding_energy=\"%.2lf\" num_in_clus=\"%d\" />\n",
+		if(mypars->calc_clustering){
+			fprintf(fp_xml, "\t<result>\n");
+			fprintf(fp_xml, "\t\t<clustering_histogram>\n");
+			for (i=0; i<num_of_clusters; i++)
+			{
+				fprintf(fp_xml, "\t\t\t<cluster cluster_rank=\"%d\" lowest_binding_energy=\"%.2lf\" run=\"%d\" mean_binding_energy=\"%.2lf\" num_in_clus=\"%d\" />\n",
 					i+1, best_energy[i], best_energy_runid[i], sum_energy[i]/cluster_sizes[i], cluster_sizes [i]);
-		}
-		fprintf(fp_xml, "\t\t</clustering_histogram>\n");
-		
-		fprintf(fp_xml, "\t\t<rmsd_table>\n");
-		for (i=0; i<num_of_clusters; i++)
-		{
-			for (j=0; j<num_of_runs; j++)
-				if (myresults [j].clus_id == i+1)
-				{
-					fprintf(fp_xml, "\t\t\t<run rank=\"%d\" sub_rank=\"%d\" run=\"%d\" binding_energy=\"%.2lf\" cluster_rmsd=\"%.2lf\" reference_rmsd=\"%.2lf\" />\n",
-					                     (myresults [j]).clus_id, (myresults [j]).clus_subrank, (myresults [j]).run_number, myresults[j].interE + myresults[j].interflexE + torsional_energy, (myresults [j]).rmsd_from_cluscent, (myresults [j]).rmsd_from_ref);
+			}
+			fprintf(fp_xml, "\t\t</clustering_histogram>\n");
+			
+			fprintf(fp_xml, "\t\t<rmsd_table>\n");
+			for (i=0; i<num_of_clusters; i++)
+			{
+				for (int u=0; u<num_of_runs; u++){
+					j = energy_order[u];
+					if (myresults[j].clus_id == i+1)
+					{
+						fprintf(fp_xml, "\t\t\t<run rank=\"%d\" sub_rank=\"%d\" run=\"%d\" binding_energy=\"%.2lf\" cluster_rmsd=\"%.2lf\" reference_rmsd=\"%.2lf\" />\n",
+							myresults[j].clus_id, myresults[j].clus_subrank, myresults[j].run_number, myresults[j].interE + myresults[j].interflexE + torsional_energy, myresults[j].rmsd_from_cluscent, myresults[j].rmsd_from_ref);
+					}
 				}
+			}
+			fprintf(fp_xml, "\t\t</rmsd_table>\n");
+			fprintf(fp_xml, "\t</result>\n");
 		}
-		fprintf(fp_xml, "\t\t</rmsd_table>\n");
-		fprintf(fp_xml, "\t</result>\n");
 		fprintf(fp_xml, "</autodock_gpu>\n");
 		fclose(fp_xml);
 		free(xml_file_name);
@@ -1178,9 +1241,8 @@ void clusanal_gendlg(
 
 void process_result(
                     const Gridinfo*        mygrid,
-                    const float*           cpu_floatgrids,
                     const Dockpars*        mypars,
-                    const Liganddata*      myligand_init,
+                          Liganddata*      myligand_init,
                     const Liganddata*      myxrayligand,
                     const int*             argc,
                           char**           argv,
@@ -1191,11 +1253,13 @@ void process_result(
 
 	// Fill in cpu_result_ligands
 	float best_energy_of_all = 1000000000000.0;
+	IntraTables tables(&(sim_state.myligand_reference), mypars->coeffs.scaled_AD4_coeff_elec, mypars->coeffs.AD4_coeff_desolv, mypars->qasp, mypars->nr_mod_atype_pairs, mypars->mod_atype_pairs);
 	for (unsigned long run_cnt=0; run_cnt < mypars->num_of_runs; run_cnt++)
 	{
 		arrange_result(sim_state.cpu_populations.data()+run_cnt*mypars->pop_size*GENOTYPE_LENGTH_IN_GLOBMEM, sim_state.cpu_energies.data()+run_cnt*mypars->pop_size, mypars->pop_size);
 		make_resfiles(sim_state.cpu_populations.data()+run_cnt*mypars->pop_size*GENOTYPE_LENGTH_IN_GLOBMEM,
 		              sim_state.cpu_energies.data()+run_cnt*mypars->pop_size,
+		              &tables,
 		              &(sim_state.myligand_reference),
 		              myligand_init,
 		              myxrayligand,
@@ -1203,7 +1267,6 @@ void process_result(
 		              sim_state.cpu_evals_of_runs[run_cnt],
 		              sim_state.generation_cnt,
 		              mygrid,
-		              cpu_floatgrids,
 		              argc,
 		              argv,
 		              /*1*/0,
@@ -1212,10 +1275,12 @@ void process_result(
 		              &(cpu_result_ligands [run_cnt]));
 	}
 
-	// Do clustering analysis and generate dlg file
-	clusanal_gendlg(cpu_result_ligands.data(),
+	// Do analyses and generate dlg or xml output files
+	generate_output(cpu_result_ligands.data(),
 	                mypars->num_of_runs,
+	                &tables,
 	                myligand_init,
+	                myxrayligand,
 	                mypars,
 	                mygrid,
 	                argc,
diff --git a/host/src/setup.cpp b/host/src/setup.cpp
index 627632b0..c30376d6 100644
--- a/host/src/setup.cpp
+++ b/host/src/setup.cpp
@@ -26,33 +26,41 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <stdio.h>
 #include <stdlib.h>
 #include <vector>
+#include <sys/stat.h>
 
 #include "filelist.hpp"
 #include "processgrid.h"
 #include "processligand.h"
+#include "processresult.h"
 #include "getparameters.h"
 #include "setup.hpp"
 
-int preload_gridsize(FileList& filelist)
+int preallocated_gridsize(FileList& filelist)
 {
 	if(!filelist.used) return 0;
 	int gridsize=0;
-	for(unsigned int i_file=0; i_file<filelist.fld_files.size(); i_file++){
-		// Filling mygrid according to the gpf file
-		if (get_gridinfo(filelist.fld_files[i_file].c_str(), &filelist.mygrids[i_file]) != 0)
-			{printf("\n\nError in get_gridinfo, stopped job."); return 1;}
-		int curr_size = 4*filelist.mygrids[i_file].size_xyz[0]*filelist.mygrids[i_file].size_xyz[1]*filelist.mygrids[i_file].size_xyz[2];
-		if(curr_size>gridsize)
-			gridsize=curr_size;
+	for(unsigned int i=0; i<filelist.fld_files.size(); i++){
+		size_t grid_idx = filelist.fld_files[i].grid_idx;
+		if(grid_idx<filelist.mygrids.size()){
+			// Filling mygrid according to the gpf file
+			if (get_gridinfo(filelist.fld_files[i].name.c_str(), &filelist.mygrids[grid_idx]) != 0){
+				printf("\n\nError in get_gridinfo, stopped job.");
+				return 1;
+			}
+			int curr_size = 4*filelist.mygrids[grid_idx].size_xyz[0]*
+			                  filelist.mygrids[grid_idx].size_xyz[1]*
+			                  filelist.mygrids[grid_idx].size_xyz[2]*
+			                  filelist.mygrids[grid_idx].grid_mapping.size();
+			if(curr_size>gridsize)
+				gridsize=curr_size;
+		}
 	}
 	return gridsize;
 }
 
 int setup(
-          std::vector<Map>&   all_maps,
-          Gridinfo&           mygrid,
-          std::vector<float>& floatgrids,
-          Dockpars&           mypars,
+          Gridinfo*           mygrid,
+          Dockpars*           mypars,
           Liganddata&         myligand_init,
           Liganddata&         myxrayligand,
           FileList&           filelist,
@@ -62,37 +70,30 @@ int setup(
          )
 {
 	// Filling the filename and coeffs fields of mypars according to command line arguments
-	if (get_filenames_and_ADcoeffs(&argc, argv, &mypars, filelist.used) != 0)
-		{printf("\n\nError in get_filenames_and_ADcoeffs, stopped job."); return 1;}
+	if (get_filenames_and_ADcoeffs(&argc, argv, mypars, filelist.used) != 0){
+		printf("\nError in get_filenames_and_ADcoeffs, stopped job.\n");
+		return 1;
+	}
 
 	//------------------------------------------------------------
-	// Testing command line arguments for cgmaps parameter,
-	// for derived atom types, and modified atom type pairs
+	// Testing command line arguments for xml2dlg mode,
+	// derived atom types, and modified atom type pairs
 	// since they will be needed at ligand and grid creation
 	//------------------------------------------------------------
 	for (int i=1; i<argc-1; i+=2)
 	{
+		if (argcmp("filelist", argv[i], 'B'))
+			i+=mypars->filelist_files-1; // skip ahead in case there are multiple entries here
+
 		if (argcmp("xml2dlg", argv[i], 'X'))
-			i+=mypars.xml_files-1; // skip ahead in case there are multiple entries here
+			i+=mypars->xml_files-1; // skip ahead in case there are multiple entries here
 
-		// ----------------------------------
-		// Argument: Use individual maps for CG-G0 instead of the same one
-		if (argcmp("cgmaps", argv [i]))
-		{
-			int tempint;
-			sscanf(argv [i+1], "%d", &tempint);
-			if (tempint == 0)
-				mypars.cgmaps = 0;
-			else
-				mypars.cgmaps = 1;
-		}
-		// ----------------------------------
 		// Argument: derivate atom types
 		if (argcmp("derivtype", argv [i], 'T'))
 		{
-			if(mypars.nr_deriv_atypes==0){
-				mypars.deriv_atypes=(deriv_atype*)malloc(sizeof(deriv_atype));
-				if(mypars.deriv_atypes==NULL){
+			if(mypars->nr_deriv_atypes==0){
+				mypars->deriv_atypes=(deriv_atype*)malloc(sizeof(deriv_atype));
+				if(mypars->deriv_atypes==NULL){
 					printf("Error: Cannot allocate memory for --derivtype (-T).\n");
 					exit(1);
 				}
@@ -103,7 +104,8 @@ int setup(
 			while(success && (*tmp!='\0')){
 				bool base_exists=false;
 				char* start_block=tmp;
-				int nr_start=mypars.nr_deriv_atypes;
+				int nr_start=mypars->nr_deriv_atypes;
+				int redefine=0;
 				// count nr of derivative atom types first
 				while((*tmp!='\0') && (*tmp!='/')){ // do one block at a time
 					if(*(tmp++)==','){ // this works here as the first character is not a ','
@@ -113,11 +115,15 @@ int setup(
 							break;
 						}
 						if(tmp-start_block-1>0){ // make sure there is a name (at least one char, we'll test later if it's taken already)
-							if(!add_deriv_atype(&mypars,start_block,tmp-start_block-1)){
+							int idx = add_deriv_atype(mypars,start_block,tmp-start_block-1,true);
+							if(idx==0){
 								printf("Error in --derivtype (-T) %s: derivative names can only be upto 3 characters long.\n",argv[i+1]);
 								success=false;
 								break;
 							}
+							// err out for first case of redefined doublets
+							// - need to wait for the base type to be certain
+							if(idx<0 && (redefine==0)) redefine=idx;
 							start_block=tmp;
 						} else{
 							printf("Error in --derivtype (-T) %s: derivative names have to be at least one character long.\n",argv[i+1]);
@@ -127,11 +133,15 @@ int setup(
 					}
 					if((*tmp=='=') && ((*(tmp+1)!='\0') || (*(tmp+1)!='/'))){
 						if(tmp-start_block>0){ // make sure there is a name (at least one char, we'll test later if it's taken already)
-							if(!add_deriv_atype(&mypars,start_block,tmp-start_block)){
+							int idx = add_deriv_atype(mypars,start_block,tmp-start_block,true);
+							if(idx==0){
 								printf("Error in --derivtype (-T) %s: derivative names can only be upto 3 characters long.\n",argv[i+1]);
 								success=false;
 								break;
 							}
+							// err out for first case of redefined doublets
+							// - need to wait for the base type to be certain
+							if(idx<0 && (redefine==0)) redefine=idx;
 						} else{
 							printf("Error in --derivtype (-T) %s: derivative names have to be at least one character long.\n",argv[i+1]);
 							success=false;
@@ -141,18 +151,26 @@ int setup(
 						base_exists=true;
 					}
 				}
-				for(int idx=nr_start; idx<mypars.nr_deriv_atypes; idx++){
-					int length=tmp-start_block;
-					if(length<4){
-						strncpy(mypars.deriv_atypes[idx].base_name,start_block,length);
-						mypars.deriv_atypes[idx].base_name[length]='\0';
-					} else{
-						printf("Error in --derivtype (-T) %s: base names can only be upto 3 characters long.\n",argv[i+1]);
+				int length=tmp-start_block;
+				if(length>=4){
+					printf("Error in --derivtype (-T) %s: base names can only be upto 3 characters long.\n",argv[i+1]);
+					success=false;
+					break;
+				}
+				if(redefine!=0){
+					redefine++; // redefine is -i-1 in case of doublet -> i = -(redefine+1)
+					redefine=-redefine;
+					if(strncmp(start_block,mypars->deriv_atypes[redefine].base_name,length)!=0){
+						printf("Error in --derivtype (-T) %s: redefinition of type %s with different base name.\n",argv[i+1],mypars->deriv_atypes[redefine].deriv_name);
 						success=false;
 						break;
 					}
+				}
+				for(int idx=nr_start; idx<mypars->nr_deriv_atypes; idx++){
+					strncpy(mypars->deriv_atypes[idx].base_name,start_block,length);
+					mypars->deriv_atypes[idx].base_name[length]='\0';
 #ifdef DERIVTYPE_INFO
-					printf("%i: %s=%s\n",mypars.deriv_atypes[idx].nr,mypars.deriv_atypes[idx].deriv_name,mypars.deriv_atypes[idx].base_name);
+					printf("%i: %s=%s\n",mypars->deriv_atypes[idx].nr,mypars->deriv_atypes[idx].deriv_name,mypars->deriv_atypes[idx].base_name);
 #endif
 				}
 				if(*tmp=='/') // need to go to next char otherwise the two loops will infinite loop (ask me how I knooooooooooooooooooooooooooooooooooooooo
@@ -171,16 +189,16 @@ int setup(
 			char* tmp=argv[i+1];
 			
 			while(success && (*tmp!='\0')){
-				mypars.nr_mod_atype_pairs++;
-				if(mypars.nr_mod_atype_pairs==1)
-					mypars.mod_atype_pairs=(pair_mod*)malloc(sizeof(pair_mod));
+				mypars->nr_mod_atype_pairs++;
+				if(mypars->nr_mod_atype_pairs==1)
+					mypars->mod_atype_pairs=(pair_mod*)malloc(sizeof(pair_mod));
 				else
-					mypars.mod_atype_pairs=(pair_mod*)realloc(mypars.mod_atype_pairs, mypars.nr_mod_atype_pairs*sizeof(pair_mod));
-				if(mypars.mod_atype_pairs==NULL){
+					mypars->mod_atype_pairs=(pair_mod*)realloc(mypars->mod_atype_pairs, mypars->nr_mod_atype_pairs*sizeof(pair_mod));
+				if(mypars->mod_atype_pairs==NULL){
 					printf("Error: Cannot allocate memory for --modpair (-P).\n");
 					exit(1);
 				}
-				pair_mod* curr_pair=&mypars.mod_atype_pairs[mypars.nr_mod_atype_pairs-1];
+				pair_mod* curr_pair=&mypars->mod_atype_pairs[mypars->nr_mod_atype_pairs-1];
 				// find atom type pair to modify
 				char* first_comma=strchr(tmp,',');
 				if(first_comma==NULL){
@@ -242,7 +260,7 @@ int setup(
 				if(*tmp=='/') // need to go to next char otherwise the two loops will infinite loop (ask me how I knooooooooooooooooooooooooooooooooooooooo
 					tmp++;
 #ifdef MODPAIR_INFO
-				printf("%i: %s:%s",mypars.nr_mod_atype_pairs,curr_pair->A,curr_pair->B);
+				printf("%i: %s:%s",mypars->nr_mod_atype_pairs,curr_pair->A,curr_pair->B);
 				for(unsigned int idx=0; idx<curr_pair->nr_parameters; idx++)
 					printf(",%f",curr_pair->parameters[idx]);
 				printf("\n");
@@ -260,167 +278,137 @@ int setup(
 	//------------------------------------------------------------
 
 	// Filling mygrid according to the fld file
-	if (get_gridinfo(mypars.fldfile, &mygrid) != 0)
+	if (get_gridinfo(mypars->fldfile, mygrid) != 0)
 	{
-		printf("\n\nError in get_gridinfo, stopped job.");
+		printf("\nError in get_gridinfo, stopped job.\n");
 		return 1;
 	}
 
 	// Filling the atom types field of myligand according to the grid types
-	if (init_liganddata(mypars.ligandfile,
-	                    mypars.flexresfile,
+	if (init_liganddata(mypars->ligandfile,
+	                    mypars->flexresfile,
 	                    &myligand_init,
-	                    &mygrid,
-	                    mypars.nr_deriv_atypes,
-	                    mypars.deriv_atypes,
-	                    mypars.cgmaps) != 0)
+	                    mygrid,
+	                    mypars->nr_deriv_atypes,
+	                    mypars->deriv_atypes) != 0)
 	{
-		printf("\n\nError in init_liganddata, stopped job.");
+		printf("\nError in init_liganddata, stopped job.\n");
 		return 1;
 	}
 
 	// Filling myligand according to the pdbqt file
-	if (get_liganddata(mypars.ligandfile,
-	                   mypars.flexresfile,
-	                   &myligand_init,
-	                   mypars.coeffs.AD4_coeff_vdW,
-	                   mypars.coeffs.AD4_coeff_hb,
-	                   mypars.nr_deriv_atypes,
-	                   mypars.deriv_atypes,
-	                   mypars.nr_mod_atype_pairs,
-	                   mypars.mod_atype_pairs) != 0)
+	if (parse_liganddata(&myligand_init,
+	                     mygrid,
+	                     mypars->coeffs.AD4_coeff_vdW,
+	                     mypars->coeffs.AD4_coeff_hb,
+	                     mypars->nr_deriv_atypes,
+	                     mypars->deriv_atypes,
+	                     mypars->nr_mod_atype_pairs,
+	                     mypars->mod_atype_pairs) != 0)
 	{
-		printf("\n\nError in get_liganddata, stopped job.");
+		printf("\nError in parse_liganddata, stopped job.\n");
 		return 1;
 	}
 
-	// Adding receptor atom information needed for analysis
-	if (mypars.contact_analysis && (mypars.flexresfile!=NULL)){
-		std::vector<ReceptorAtom> flexresatoms = read_receptor_atoms(mypars.flexresfile);
-		mypars.receptor_atoms.insert(mypars.receptor_atoms.end(), flexresatoms.begin(), flexresatoms.end());
-		for(int i=myligand_init.true_ligand_atoms; i<myligand_init.num_of_atoms; i++){
-			mypars.receptor_atoms[mypars.nr_receptor_atoms+i-myligand_init.true_ligand_atoms].acceptor=myligand_init.acceptor[i];
-			mypars.receptor_atoms[mypars.nr_receptor_atoms+i-myligand_init.true_ligand_atoms].donor=myligand_init.donor[i];
+	if (mypars->contact_analysis){
+		// read receptor in case contact analysis is requested and we haven't done so already
+		if(!filelist.preload_maps){
+			std::string receptor_name=mygrid->grid_file_path;
+			if(mygrid->grid_file_path.size()>0) receptor_name+="/";
+			receptor_name += mygrid->receptor_name + ".pdbqt";
+			mypars->receptor_atoms = read_receptor(receptor_name.c_str(),mygrid,mypars->receptor_map,mypars->receptor_map_list);
+			mypars->nr_receptor_atoms = mypars->receptor_atoms.size();
+		}
+		// Adding flex res atom information needed for analysis
+		if(mypars->flexresfile!=NULL){
+			std::vector<ReceptorAtom> flexresatoms = read_receptor_atoms(mypars->flexresfile);
+			mypars->receptor_atoms.insert(mypars->receptor_atoms.end(), flexresatoms.begin(), flexresatoms.end());
+			for(int i=myligand_init.true_ligand_atoms; i<myligand_init.num_of_atoms; i++){
+				mypars->receptor_atoms[mypars->nr_receptor_atoms+i-myligand_init.true_ligand_atoms].acceptor=myligand_init.acceptor[i];
+				mypars->receptor_atoms[mypars->nr_receptor_atoms+i-myligand_init.true_ligand_atoms].donor=myligand_init.donor[i];
+			}
 		}
 	}
 
-	// Resize grid
-	floatgrids.resize(4*(mygrid.num_of_atypes+2)*mygrid.size_xyz[0]*mygrid.size_xyz[1]*mygrid.size_xyz[2]);
-
-	if (filelist.preload_maps){
-		if (!filelist.maps_are_loaded) { // maps not yet loaded
-			bool got_error = false;
+	// Reading the grid files and storing values if not already done
+	int grid_status;
 #ifdef USE_PIPELINE
-			#pragma omp critical
+	#pragma omp critical
 #endif
-			{
-				if (!filelist.maps_are_loaded) { // maps not yet loaded (but in critical, so only one thread will ever enter this)
-					// Load maps to all_maps
-					if (load_all_maps(mypars.fldfile,
-					                  &mygrid,
-					                  all_maps,
-					                  mypars.cgmaps) != 0)
-					{
-						got_error = true;
-					}
-					filelist.maps_are_loaded = true;
-				}
-			}
-			// Return must be outside pragma
-			if (got_error) {
-				printf("\n\nError in load_all_maps, stopped job.");
-				return 1;
-			}
-		}
-
-		// Copy maps from all_maps
-		if (copy_from_all_maps(&mygrid,
-		                       floatgrids.data(),
-		                       all_maps) != 0)
-		{
-			printf("\n\nError in copy_from_all_maps, stopped job.");
-			return 1;
-		}
-
-		// Specify total number of maps that will be on GPU
-		mygrid.num_of_map_atypes = all_maps.size()-2; // For the two extra maps
-		// Map atom_types used for ligand processing to all_maps so all the maps can stay on GPU
-		if (map_to_all_maps(&mygrid,
-		                    &myligand_init,
-		                    all_maps) !=0)
-		{
-			printf("\n\nError in map_to_all_maps, stopped job.");
-			return 1;
-		}
-	} else {
-		// read receptor in case contact analysis is requested and we haven't done so already (in the preload case above)
-		if(mypars.contact_analysis){
-			std::string receptor_name=mygrid.grid_file_path;
-			if(strlen(mygrid.grid_file_path)>0) receptor_name+="/";
-			receptor_name += mygrid.receptor_name;
-			receptor_name += ".pdbqt";
-			mypars.receptor_atoms = read_receptor(receptor_name.c_str(),&mygrid,mypars.receptor_map,mypars.receptor_map_list);
-			mypars.nr_receptor_atoms = mypars.receptor_atoms.size();
-		}
-		// Reading the grid files and storing values in the memory region pointed by floatgrids
-		if (get_gridvalues_f(&mygrid,
-		                     floatgrids.data(),
-		                     mypars.cgmaps) != 0)
-		{
-			printf("\n\nError in get_gridvalues_f, stopped job.");
-			return 1;
-		}
+	grid_status = get_gridvalues(mygrid);
+	if(grid_status!=0)
+	{
+		printf("\nError in get_gridvalues, stopped job.\n");
+		return 1;
 	}
 
 	//------------------------------------------------------------
 	// Capturing algorithm parameters (command line args)
 	//------------------------------------------------------------
-	char* orig_resn = mypars.resname;
-	if(get_commandpars(&argc, argv, &(mygrid.spacing), &mypars)<0)
+	char* orig_resname;
+	if(mypars->resname)
+		orig_resname = strdup(mypars->resname); // need to copy it since it might get free'd in the next line
+	else
+		orig_resname = strdup(""); // need an empty string for later if no resname has been specified
+
+	if(get_commandpars(&argc, argv, &(mygrid->spacing), mypars)<0)
 		return 1;
 
 	// command-line specified resname with more than one file
-	if (!mypars.xml2dlg){ // if the user specified an xml file, that's the one we want to use
-		if ((orig_resn!=mypars.resname) && (filelist.nfiles>1)){ // add an index to existing name distinguish the files
-			char* tmp = strdup(mypars.resname);
-			char* nrtmp = strdup(std::to_string(i_file+1).c_str());
-			if(mypars.resname) free(mypars.resname);
-			mypars.resname = (char*)malloc((strlen(tmp)+strlen(nrtmp)+2)*sizeof(char));
-			strcpy(mypars.resname, tmp);
-			strcat(mypars.resname,"_");
-			strcat(mypars.resname, nrtmp);
-			free(tmp);
-			free(nrtmp);
+	if (!mypars->xml2dlg){ // if the user specified an xml file, that's the one we want to use
+		if ((strcmp(orig_resname,mypars->resname)!=0) && (filelist.nfiles>1)){ // use resname as prefix
+			char* tmp = (char*)malloc(strlen(mypars->resname)+strlen(orig_resname)+1);
+			const char* last_slash = strrchr(orig_resname,'/'); // take care of potential directory path
+			long long dir = (last_slash!=NULL) ? (last_slash-orig_resname+1) : 0; // no '/' found: avoid arithmetic on a NULL pointer, dir=0 selects the no-directory branch below
+			if(dir>0){
+				strncpy(tmp, orig_resname, dir);
+				tmp[dir]='\0';
+				strcat(tmp, mypars->resname);
+				strcat(tmp, &orig_resname[dir]);
+			} else{
+				strcpy(tmp, mypars->resname);
+				strcat(tmp, orig_resname);
+			}
+			free(mypars->resname);
+			mypars->resname = tmp;
 		}
 	}
+	free(orig_resname);
+	
+	struct stat res_stat; // filled by stat() for the directory part of the output name
+	int res_int = stat(get_filepath(mypars->resname).c_str(), &res_stat);
+	if ((res_int != 0) || ((res_stat.st_mode & S_IFMT) != S_IFDIR)){ // mask with S_IFMT first: a bare "& S_IFDIR" bit test also matches block devices and sockets
+		printf("\nError: Specified directory \"%s\" for output files (e.g. with `--resnam`) does not exist.\n",get_filepath(mypars->resname).c_str());
+		exit(11);
+	}
 
 	Gridinfo mydummygrid;
 	// if -lxrayfile provided, then read xray ligand data
-	if (mypars.given_xrayligandfile == true)
+	if (mypars->given_xrayligandfile)
 	{
-		if (init_liganddata(mypars.xrayligandfile,
-		                    "\0",
+
+		if (init_liganddata(mypars->xrayligandfile,
+		                    mypars->flexresfile,
 		                    &myxrayligand,
-		                    &mydummygrid,
-		                    0,
-		                    NULL,
-		                    mypars.cgmaps) != 0)
+		                    mygrid,
+		                    mypars->nr_deriv_atypes,
+		                    mypars->deriv_atypes) != 0)
 		{
-			printf("\n\nError in init_liganddata, stopped job.");
+			printf("\nError in init_liganddata, stopped job.\n");
 			return 1;
 		}
 
-		if (get_liganddata(mypars.xrayligandfile,
-		                   "\0",
-		                   &myxrayligand,
-		                   mypars.coeffs.AD4_coeff_vdW,
-		                   mypars.coeffs.AD4_coeff_hb,
-		                   mypars.nr_deriv_atypes,
-		                   mypars.deriv_atypes,
-		                   mypars.nr_mod_atype_pairs,
-		                   mypars.mod_atype_pairs) != 0)
+		// Filling myligand according to the pdbqt file
+		if (parse_liganddata(&myxrayligand,
+		                     mygrid,
+		                     mypars->coeffs.AD4_coeff_vdW,
+		                     mypars->coeffs.AD4_coeff_hb,
+		                     mypars->nr_deriv_atypes,
+		                     mypars->deriv_atypes,
+		                     mypars->nr_mod_atype_pairs,
+		                     mypars->mod_atype_pairs) != 0)
 		{
-			printf("\n\nError in get_liganddata, stopped job.");
+			printf("\nError in parse_liganddata, stopped job.\n");
 			return 1;
 		}
 	}
@@ -428,161 +416,44 @@ int setup(
 	//------------------------------------------------------------
 	// Calculating energies of reference ligand if required
 	//------------------------------------------------------------
-	if (mypars.reflig_en_required) {
-		print_ref_lig_energies_f(myligand_init,
-		                         mypars.smooth,
-		                         mygrid,
-		                         floatgrids.data(),
-		                         mypars.coeffs.scaled_AD4_coeff_elec,
-		                         mypars.elec_min_distance,
-		                         mypars.coeffs.AD4_coeff_desolv,
-		                         mypars.qasp,
-		                         mypars.nr_mod_atype_pairs,
-		                         mypars.mod_atype_pairs);
-	}
-
-	return 0;
-}
-
-int fill_maplist(
-                 const char*             fldfilename,
-                       std::vector<Map>& all_maps)
-{
-	std::ifstream file(fldfilename);
-	if(file.fail()){
-		printf("\nError: Could not open %s. Check path and permissions.",fldfilename);
-		return 1;
-	}
-	std::string line;
-	while(std::getline(file, line)) {
-		std::stringstream sline(line.c_str());
-		// Split line by spaces:
-		std::string word;
-		bool is_variable_line=false;
-		while(std::getline(sline, word, ' ')){
-			// Check if first word is "variable"
-			if (word.compare("variable") == 0) is_variable_line=true;
-			int len = word.size();
-                        if (is_variable_line && len>=4 && word.compare(len-4,4,".map") == 0){ // Found a word that ends in "map"
-				// Split the map into segments e.g. protein.O.map -> "protein", "O", "map"
-				std::stringstream mapword(word.c_str());
-				std::string segment;
-				std::vector<std::string> seglist;
-				while(std::getline(mapword, segment, '.')) seglist.push_back(segment);
-
-				// Create a new map with the atom name
-				all_maps.push_back(Map(seglist[seglist.size()-2]));
-			}
-		}
-	}
-	return 0;
-}
-
-int load_all_maps(
-                  const char*             fldfilename,
-                  const Gridinfo*         mygrid,
-                        std::vector<Map>& all_maps,
-                        bool              cgmaps
-                 )
-{
-	// First, parse .fld file to get map names
-	if(fill_maplist(fldfilename,all_maps)==1) return 1;
-
-	// Now fill the maps
-	int x, y, z;
-	FILE* fp;
-	size_t len = strlen(mygrid->grid_file_path)+strlen(mygrid->receptor_name)+1;
-	if(strlen(mygrid->map_base_name)>len)
-		len = strlen(mygrid->map_base_name);
-	len += 10; // "..map\0" = 6 entries + 4 at most for grid type
-	if(len<128) len=128;
-	char* tempstr = (char*)malloc(len*sizeof(char));
-	int size_of_one_map = 4*mygrid->size_xyz[0]*mygrid->size_xyz[1]*mygrid->size_xyz[2];
-
-	for (unsigned int t=0; t < all_maps.size(); t++)
-	{
-		all_maps[t].grid.resize(size_of_one_map);
-		float* mypoi = all_maps[t].grid.data();
-		// opening corresponding .map file
-		strcpy(tempstr,mygrid->map_base_name);
-		strcat(tempstr, ".");
-		strcat(tempstr, all_maps[t].atype.c_str());
-		strcat(tempstr, ".map");
-		fp = fopen(tempstr, "rb"); // fp = fopen(tempstr, "r");
-		if (fp == NULL){ // try again with the receptor name in the .maps.fld file
-			strcpy(tempstr,mygrid->grid_file_path);
-			strcat(tempstr, "/");
-			strcat(tempstr, mygrid->receptor_name);
-			strcat(tempstr, ".");
-			strcat(tempstr, all_maps[t].atype.c_str());
-			strcat(tempstr, ".map");
-			fp = fopen(tempstr, "rb"); // fp = fopen(tempstr, "r");
-		}
-		if (fp == NULL)
-		{
-			printf("Error: can't open %s!\n", tempstr);
-			if ((strncmp(all_maps[t].atype.c_str(),"CG",2)==0) ||
-			    (strncmp(all_maps[t].atype.c_str(),"G",1)==0))
-			{
-				if(cgmaps)
-					printf("-> Expecting an individual map for each CGx and Gx (x=0..9) atom type.\n");
-				else
-					printf("-> Expecting one map file, ending in .CG.map and .G0.map, for CGx and Gx atom types, respectively.\n");
-				}
-			return 1;
-		}
-
-		// seeking to first data
-		do    fscanf(fp, "%127s", tempstr);
-		while (strcmp(tempstr, "CENTER") != 0);
-		fscanf(fp, "%127s", tempstr);
-		fscanf(fp, "%127s", tempstr);
-		fscanf(fp, "%127s", tempstr);
-
-		unsigned int g1 = mygrid->size_xyz[0];
-		unsigned int g2 = g1*mygrid->size_xyz[1];
-		// reading values
-		for (z=0; z < mygrid->size_xyz[2]; z++)
-			for (y=0; y < mygrid->size_xyz[1]; y++)
-				for (x=0; x < mygrid->size_xyz[0]; x++)
-				{
-					fscanf(fp, "%f", mypoi);
-					// fill in duplicate data for linearized memory access in kernel
-					if(y>0) *(mypoi-4*g1+1) = *mypoi;
-					if(z>0) *(mypoi-4*g2+2) = *mypoi;
-					if(y>0 && z>0) *(mypoi-4*(g2+g1)+3) = *mypoi;
-					mypoi+=4;
-				}
-
-		fclose(fp);
-	}
-	free(tempstr);
-	return 0;
-}
-
-int copy_from_all_maps(
-                       const Gridinfo*         mygrid,
-                             float*            fgrids,
-                             std::vector<Map>& all_maps
-                      )
-{
-	int size_of_one_map = 4*mygrid->size_xyz[0]*mygrid->size_xyz[1]*mygrid->size_xyz[2];
-	for (int t=0; t < mygrid->num_of_atypes+2; t++) {
-		// Look in all_maps for desired map
-		int i_map = -1;
-		for (unsigned int i_atype=0; i_atype < all_maps.size(); i_atype++){
-			if (strcmp(mygrid->grid_types[t],all_maps[i_atype].atype.c_str())==0){
-				i_map = i_atype; // Found the map!
-				break;
-			}
-		}
-		if (i_map == -1){ // Didnt find the map
-			printf("\nError: The %s map needed for the ligand was not found in the .fld file!", mygrid->grid_types[t]);
-			return 1;
-		}
-
-		// Copy from all_maps into fgrids
-		memcpy(fgrids+t*size_of_one_map,all_maps[i_map].grid.data(),sizeof(float)*all_maps[i_map].grid.size());
+	if (mypars->reflig_en_required) {
+		IntraTables tables(&myligand_init, mypars->coeffs.scaled_AD4_coeff_elec, mypars->coeffs.AD4_coeff_desolv, mypars->qasp, mypars->nr_mod_atype_pairs, mypars->mod_atype_pairs);
+		printf("\n");
+		if(mypars->given_xrayligandfile)
+			printf("Reference");
+		else
+			printf("Input");
+		printf(" ligand energies");
+#ifdef TOOLMODE
+		if(mypars->contact_analysis) printf(" and contact analysis");
+#endif
+		printf(":\n");
+		if (mypars->given_xrayligandfile)
+			ligand_calc_output(stdout,
+			                   "",
+			                   &tables,
+			                   &myxrayligand,
+			                   mypars,
+			                   mygrid,
+#ifdef TOOLMODE
+			                   mypars->contact_analysis,
+#else
+			                   false,
+#endif
+			                   true);
+		else
+			ligand_calc_output(stdout,
+			                   "",
+			                   &tables,
+			                   &myligand_init,
+			                   mypars,
+			                   mygrid,
+#ifdef TOOLMODE
+			                   mypars->contact_analysis,
+#else
+			                   false,
+#endif
+			                   true);
 	}
 
 	return 0;
diff --git a/link_cuda.sh b/link_cuda.sh
deleted file mode 100755
index 6b27f162..00000000
--- a/link_cuda.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-# Link Cuda code files for compilation
-
-ln -sf performdocking.h.Cuda host/inc/performdocking.h
-ln -sf performdocking.cpp.Cuda host/src/performdocking.cpp
diff --git a/link_opencl.sh b/link_opencl.sh
deleted file mode 100755
index b2b0980e..00000000
--- a/link_opencl.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-# Link OpenCL code files for compilation
-
-ln -sf performdocking.h.OpenCL host/inc/performdocking.h
-ln -sf performdocking.cpp.OpenCL host/src/performdocking.cpp
diff --git a/wrapcl/inc/BufferObjects.h b/wrapcl/inc/BufferObjects.h
index e1c9e533..e43b0603 100644
--- a/wrapcl/inc/BufferObjects.h
+++ b/wrapcl/inc/BufferObjects.h
@@ -47,7 +47,7 @@ int memcopyBufferObjectToDevice(
                                 cl_command_queue cmd_queue,
                                 cl_mem           dest,
                                 bool             blocking,
-                                void*            src,
+                                const void*      src,
                                 size_t           size
                                );
 
diff --git a/wrapcl/src/BufferObjects.cpp b/wrapcl/src/BufferObjects.cpp
index a3905532..0f07acf9 100644
--- a/wrapcl/src/BufferObjects.cpp
+++ b/wrapcl/src/BufferObjects.cpp
@@ -214,7 +214,7 @@ int memcopyBufferObjectToDevice(
                                 cl_command_queue cmd_queue,
                                 cl_mem           dest,
                                 bool             blocking,
-                                void*            src,
+                                const void*      src,
                                 size_t           size
                                )
 {
@@ -335,7 +335,7 @@ int unmemMap(
 	                              NULL
 	                             );
 	if (err != CL_SUCCESS){
-		printf("Error: clEnqueueUnmapMemObjetc() %d\n", err);
+		printf("Error: clEnqueueUnmapMemObject() %d\n", err);
 		fflush(stdout);
 		return EXIT_FAILURE;
 	}