diff --git a/kernel/fpt/fpt.c b/kernel/fpt/fpt.c
index 798b7323..8e6606e5 100644
--- a/kernel/fpt/fpt.c
+++ b/kernel/fpt/fpt.c
@@ -22,19 +22,23 @@
  * \author Jens Keiner
  */
 
+/* configure header */
 #include "config.h"
 
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <math.h>
+/* complex datatype (maybe) */
 #ifdef HAVE_COMPLEX_H
-#include <complex.h>
+#include<complex.h>
 #endif
 
+/* NFFT headers */
 #include "nfft3.h"
 #include "infft.h"
 
+#include <stdbool.h>
+
+#undef X
+#define X(name) FPT(name)
+
 /* Macros for index calculation. */
 
 /** Minimum degree at top of a cascade */
@@ -44,10 +48,10 @@
 #define K_END_TILDE(x,y) MIN(x,y-1)
 
 /** Index of first block of four functions at level */
-#define FIRST_L(x,y) (LRINT(floor((x)/(double)y)))
+#define FIRST_L(x,y) (LRINT(FLOOR((x)/(R)y)))
 
 /** Index of last block of four functions at level */
-#define LAST_L(x,y) (LRINT(ceil(((x)+1)/(double)y))-1)
+#define LAST_L(x,y) (LRINT(CEIL(((x)+1)/(R)y))-1)
 
 #define N_TILDE(y) (y-1)
 
@@ -60,13 +64,13 @@
  */
 typedef struct fpt_step_
 {
-  bool stable;                            /**< Indicates if the values
-                                               contained represent a fast or
-                                               a slow stabilized step.        */
+  /** Whether this instance stores data for a stable fast step */
+  bool stable;
+
   int Ns;                                 /**< TODO Add comment here.         */
   int ts;                                 /**< TODO Add comment here.         */
-  double **a11,**a12,**a21,**a22;         /**< The matrix components          */
-  double *g;                              /**<                                */
+  R **a11,**a12,**a21,**a22;         /**< The matrix components          */
+  R *g;                              /**<                                */
 } fpt_step;
 
 /**
@@ -76,16 +80,16 @@ typedef struct fpt_data_
 {
   fpt_step **steps;                       /**< The cascade summation steps    */
   int k_start;                            /**< TODO Add comment here.         */
-  double *alphaN;                         /**< TODO Add comment here.         */
-  double *betaN;                          /**< TODO Add comment here.         */
-  double *gammaN;                         /**< TODO Add comment here.         */
-  double alpha_0;                         /**< TODO Add comment here.         */
-  double beta_0;                          /**< TODO Add comment here.         */
-  double gamma_m1;                        /**< TODO Add comment here.         */
+  R *alphaN;                         /**< TODO Add comment here.         */
+  R *betaN;                          /**< TODO Add comment here.         */
+  R *gammaN;                         /**< TODO Add comment here.         */
+  R alpha_0;                         /**< TODO Add comment here.         */
+  R beta_0;                          /**< TODO Add comment here.         */
+  R gamma_m1;                        /**< TODO Add comment here.         */
   /* Data for direct transform. */        /**< TODO Add comment here.         */
-  double *_alpha;                          /**< TODO Add comment here.         */
-  double *_beta;                           /**< TODO Add comment here.         */
-  double *_gamma;                          /**< TODO Add comment here.         */
+  R *_alpha;                          /**< TODO Add comment here.         */
+  R *_beta;                           /**< TODO Add comment here.         */
+  R *_gamma;                          /**< TODO Add comment here.         */
 } fpt_data;
 
 /**
@@ -93,53 +97,53 @@ typedef struct fpt_data_
  */
 typedef struct fpt_set_s_
 {
-  int flags;                              /**< The flags                     */
+  unsigned int flags;                              /**< The flags                     */
   int M;                                  /**< The number of DPT transforms  */
   int N;                                  /**< The transform length. Must be
                                                a power of two.               */
   int t;                                  /**< The exponent of N             */
   fpt_data *dpt;                          /**< The DPT transform data        */
-  double **xcvecs;                        /**< Array of pointers to arrays
+  R **xcvecs;                        /**< Array of pointers to arrays
                                                containing the Chebyshev
                                                nodes                         */
-  double *xc;                             /**< Array for Chebychev-nodes.    */
-  double _Complex *temp;                          /**< */
-  double _Complex *work;                          /**< */
-  double _Complex *result;                        /**< */
-  double _Complex *vec3;
-  double _Complex *vec4;
-  double _Complex *z;
-  fftw_plan *plans_dct3;                  /**< Transform plans for the fftw
+  R *xc;                             /**< Array for Chebychev-nodes.    */
+  C *temp;                          /**< */
+  C *work;                          /**< */
+  C *result;                        /**< */
+  C *vec3;
+  C *vec4;
+  C *z;
+  FFTW(plan) *plans_dct3;                  /**< Transform plans for the fftw
                                                library                       */
-  fftw_plan *plans_dct2;                  /**< Transform plans for the fftw
+  FFTW(plan) *plans_dct2;                  /**< Transform plans for the fftws
                                                library                       */
-  fftw_r2r_kind *kinds;                   /**< Transform kinds for fftw
+  FFTW(r2r_kind) *kinds;                   /**< Transform kinds for fftw
                                                library                       */
-  fftw_r2r_kind *kindsr;                  /**< Transform kinds for fftw
+  FFTW(r2r_kind) *kindsr;                  /**< Transform kinds for fftw
                                                library                       */
 
   int *lengths; /**< Transform lengths for fftw library */
 
   /* Data for slow transforms. */
-  double *xc_slow;
+  R *xc_slow;
 } fpt_set_s;
 
-static inline void abuvxpwy(double a, double b, double _Complex* u, double _Complex* x,
-  double* v, double _Complex* y, double* w, int n)
+static inline void abuvxpwy(R a, R b, C* u, C* x,
+  R* v, C* y, R* w, int n)
 {
-  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
-  double *v_ptr = v, *w_ptr = w;
+  int l; C *u_ptr = u, *x_ptr = x, *y_ptr = y;
+  R *v_ptr = v, *w_ptr = w;
   for (l = 0; l < n; l++)
     *u_ptr++ = a * (b * (*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++));
 }
 
 #define ABUVXPWY_SYMMETRIC(NAME,S1,S2) \
-static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
-  double* v, double _Complex* y, double* w, int n) \
+static inline void NAME(R a, R b, C* u, C* x, \
+  R* v, C* y, R* w, int n) \
 { \
   const int n2 = n>>1; \
-  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
-  double *v_ptr = v, *w_ptr = w; \
+  int l; C *u_ptr = u, *x_ptr = x, *y_ptr = y; \
+  R *v_ptr = v, *w_ptr = w; \
   for (l = 0; l < n2; l++) \
     *u_ptr++ = a * (b * (*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++)); \
   v_ptr--; w_ptr--; \
@@ -151,12 +155,12 @@ ABUVXPWY_SYMMETRIC(abuvxpwy_symmetric1,1.0,-1.0)
 ABUVXPWY_SYMMETRIC(abuvxpwy_symmetric2,-1.0,1.0)
 
 #define ABUVXPWY_SYMMETRIC_1(NAME,S1) \
-static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
-  double* v, double _Complex* y, int n, double *xx) \
+static inline void NAME(R a, R b, C* u, C* x, \
+  R* v, C* y, int n, R *xx) \
 { \
   const int n2 = n>>1; \
-  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
-  double *v_ptr = v, *xx_ptr = xx; \
+  int l; C *u_ptr = u, *x_ptr = x, *y_ptr = y; \
+  R *v_ptr = v, *xx_ptr = xx; \
   for (l = 0; l < n2; l++, v_ptr++) \
     *u_ptr++ = a * (b * (*v_ptr) * (*x_ptr++) + ((*v_ptr)*(1.0+*xx_ptr++)) * (*y_ptr++)); \
   v_ptr--; \
@@ -168,12 +172,12 @@ ABUVXPWY_SYMMETRIC_1(abuvxpwy_symmetric1_1,1.0)
 ABUVXPWY_SYMMETRIC_1(abuvxpwy_symmetric1_2,-1.0)
 
 #define ABUVXPWY_SYMMETRIC_2(NAME,S1) \
-static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
-  double _Complex* y, double* w, int n, double *xx) \
+static inline void NAME(R a, R b, C* u, C* x, \
+  C* y, R* w, int n, R *xx) \
 { \
   const int n2 = n>>1; \
-  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
-  double *w_ptr = w, *xx_ptr = xx; \
+  int l; C *u_ptr = u, *x_ptr = x, *y_ptr = y; \
+  R *w_ptr = w, *xx_ptr = xx; \
   for (l = 0; l < n2; l++, w_ptr++) \
     *u_ptr++ = a * (b * (*w_ptr/(1.0+*xx_ptr++)) * (*x_ptr++) + (*w_ptr) * (*y_ptr++)); \
   w_ptr--; \
@@ -184,23 +188,23 @@ static inline void NAME(double a, double b, double _Complex* u, double _Complex*
 ABUVXPWY_SYMMETRIC_2(abuvxpwy_symmetric2_1,1.0)
 ABUVXPWY_SYMMETRIC_2(abuvxpwy_symmetric2_2,-1.0)
 
-static inline void auvxpwy(double a, double _Complex* u, double _Complex* x, double* v,
-  double _Complex* y, double* w, int n)
+static inline void auvxpwy(R a, C* u, C* x, R* v,
+  C* y, R* w, int n)
 {
   int l;
-  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
-  double *v_ptr = v, *w_ptr = w;
+  C *u_ptr = u, *x_ptr = x, *y_ptr = y;
+  R *v_ptr = v, *w_ptr = w;
   for (l = n; l > 0; l--)
     *u_ptr++ = a * ((*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++));
 }
 
-static inline void auvxpwy_symmetric(double a, double _Complex* u, double _Complex* x,
-  double* v, double _Complex* y, double* w, int n)
+static inline void auvxpwy_symmetric(R a, C* u, C* x,
+  R* v, C* y, R* w, int n)
 {
   const int n2 = n>>1; \
   int l;
-  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
-  double *v_ptr = v, *w_ptr = w;
+  C *u_ptr = u, *x_ptr = x, *y_ptr = y;
+  R *v_ptr = v, *w_ptr = w;
   for (l = n2; l > 0; l--)
     *u_ptr++ = a * ((*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++));
   v_ptr--; w_ptr--;
@@ -208,13 +212,13 @@ static inline void auvxpwy_symmetric(double a, double _Complex* u, double _Compl
     *u_ptr++ = a * ((*v_ptr--) * (*x_ptr++) - (*w_ptr--) * (*y_ptr++));
 }
 
-static inline void auvxpwy_symmetric_1(double a, double _Complex* u, double _Complex* x,
-  double* v, double _Complex* y, double* w, int n, double *xx)
+static inline void auvxpwy_symmetric_1(R a, C* u, C* x,
+  R* v, C* y, R* w, int n, R *xx)
 {
   const int n2 = n>>1; \
   int l;
-  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
-  double *v_ptr = v, *w_ptr = w, *xx_ptr = xx;
+  C *u_ptr = u, *x_ptr = x, *y_ptr = y;
+  R *v_ptr = v, *w_ptr = w, *xx_ptr = xx;
   for (l = n2; l > 0; l--, xx_ptr++)
     *u_ptr++ = a * (((*v_ptr++)*(1.0+*xx_ptr)) * (*x_ptr++) + ((*w_ptr++)*(1.0+*xx_ptr)) * (*y_ptr++));
   v_ptr--; w_ptr--;
@@ -222,13 +226,13 @@ static inline void auvxpwy_symmetric_1(double a, double _Complex* u, double _Com
     *u_ptr++ = a * (-((*v_ptr--)*(1.0+*xx_ptr)) * (*x_ptr++) + ((*w_ptr--)*(1.0+*xx_ptr)) * (*y_ptr++));
 }
 
-static inline void auvxpwy_symmetric_2(double a, double _Complex* u, double _Complex* x,
-  double* v, double _Complex* y, double* w, int n, double *xx)
+static inline void auvxpwy_symmetric_2(R a, C* u, C* x,
+  R* v, C* y, R* w, int n, R *xx)
 {
   const int n2 = n>>1; \
   int l;
-  double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y;
-  double *v_ptr = v, *w_ptr = w, *xx_ptr = xx;
+  C *u_ptr = u, *x_ptr = x, *y_ptr = y;
+  R *v_ptr = v, *w_ptr = w, *xx_ptr = xx;
   for (l = n2; l > 0; l--, xx_ptr++)
     *u_ptr++ = a * (((*v_ptr++)/(1.0+*xx_ptr)) * (*x_ptr++) + ((*w_ptr++)/(1.0+*xx_ptr)) * (*y_ptr++));
   v_ptr--; w_ptr--;
@@ -237,21 +241,21 @@ static inline void auvxpwy_symmetric_2(double a, double _Complex* u, double _Com
 }
 
 #define FPT_DO_STEP(NAME,M1_FUNCTION,M2_FUNCTION) \
-static inline void NAME(double _Complex  *a, double _Complex *b, double *a11, double *a12, \
-  double *a21, double *a22, double g, int tau, fpt_set set) \
+static inline void NAME(C  *a, C *b, R *a11, R *a12, \
+  R *a21, R *a22, R g, int tau, fpt_set set) \
 { \
   /** The length of the coefficient arrays. */ \
   int length = 1<<(tau+1); \
   /** Twice the length of the coefficient arrays. */ \
-  double norm = 1.0/(length<<1); \
+  R norm = 1.0/(length<<1); \
   \
   /* Compensate for factors introduced by a raw DCT-III. */ \
   a[0] *= 2.0; \
   b[0] *= 2.0; \
   \
   /* Compute function values from Chebyshev-coefficients using a DCT-III. */ \
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a); \
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b); \
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)a,(R*)a); \
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)b,(R*)b); \
   \
   /*for (k = 0; k < length; k++)*/ \
   /*{*/ \
@@ -272,15 +276,15 @@ static inline void NAME(double _Complex  *a, double _Complex *b, double *a11, do
     /* Perform multiplication for both rows. */ \
     M2_FUNCTION(norm,set->z,b,a22,a,a21,length); \
     M1_FUNCTION(norm*g,a,a,a11,b,a12,length); \
-    memcpy(b,set->z,length*sizeof(double _Complex)); \
+    memcpy(b,set->z, (size_t)(length) * sizeof(C)); \
     /* Compute Chebyshev-coefficients using a DCT-II. */ \
-    fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a); \
+    fftw_execute_r2r(set->plans_dct2[tau-1],(R*)a,(R*)a); \
     /* Compensate for factors introduced by a raw DCT-II. */ \
     a[0] *= 0.5; \
   } \
   \
   /* Compute Chebyshev-coefficients using a DCT-II. */ \
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b); \
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)b,(R*)b); \
   /* Compensate for factors introduced by a raw DCT-II. */ \
   b[0] *= 0.5; \
 }
@@ -290,14 +294,14 @@ FPT_DO_STEP(fpt_do_step_symmetric,auvxpwy_symmetric,auvxpwy_symmetric)
 /*FPT_DO_STEP(fpt_do_step_symmetric_u,auvxpwy_symmetric,auvxpwy)
 FPT_DO_STEP(fpt_do_step_symmetric_l,auvxpwy,auvxpwy_symmetric)*/
 
-static inline void fpt_do_step_symmetric_u(double _Complex *a, double _Complex *b,
-  double *a11, double *a12, double *a21, double *a22, double *x,
-  double gam, int tau, fpt_set set)
+static inline void fpt_do_step_symmetric_u(C *a, C *b,
+  R *a11, R *a12, R *a21, R *a22, R *x,
+  R gam, int tau, fpt_set set)
 {
   /** The length of the coefficient arrays. */
   int length = 1<<(tau+1);
   /** Twice the length of the coefficient arrays. */
-  double norm = 1.0/(length<<1);
+  R norm = 1.0/(length<<1);
 
   UNUSED(a21); UNUSED(a22);
 
@@ -306,8 +310,8 @@ static inline void fpt_do_step_symmetric_u(double _Complex *a, double _Complex *
   b[0] *= 2.0;
 
   /* Compute function values from Chebyshev-coefficients using a DCT-III. */
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)a,(R*)a);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)b,(R*)b);
 
   /*for (k = 0; k < length; k++)*/
   /*{*/
@@ -328,26 +332,26 @@ static inline void fpt_do_step_symmetric_u(double _Complex *a, double _Complex *
     /* Perform multiplication for both rows. */
     auvxpwy_symmetric_1(norm,set->z,b,a12,a,a11,length,x);
     auvxpwy_symmetric(norm*gam,a,a,a11,b,a12,length);
-    memcpy(b,set->z,length*sizeof(double _Complex));
+    memcpy(b,set->z,(size_t)(length) * sizeof(C));
     /* Compute Chebyshev-coefficients using a DCT-II. */
-    fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
+    fftw_execute_r2r(set->plans_dct2[tau-1],(R*)a,(R*)a);
     /* Compensate for factors introduced by a raw DCT-II. */
     a[0] *= 0.5;
   }
 
   /* Compute Chebyshev-coefficients using a DCT-II. */
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)b,(R*)b);
   /* Compensate for factors introduced by a raw DCT-II. */
   b[0] *= 0.5;
 }
 
-static inline void fpt_do_step_symmetric_l(double _Complex  *a, double _Complex *b,
-  double *a11, double *a12, double *a21, double *a22, double *x, double gam, int tau, fpt_set set)
+static inline void fpt_do_step_symmetric_l(C  *a, C *b,
+  R *a11, R *a12, R *a21, R *a22, R *x, R gam, int tau, fpt_set set)
 {
   /** The length of the coefficient arrays. */
   int length = 1<<(tau+1);
   /** Twice the length of the coefficient arrays. */
-  double norm = 1.0/(length<<1);
+  R norm = 1.0/(length<<1);
 
   /* Compensate for factors introduced by a raw DCT-III. */
   a[0] *= 2.0;
@@ -356,8 +360,8 @@ static inline void fpt_do_step_symmetric_l(double _Complex  *a, double _Complex
   UNUSED(a11); UNUSED(a12);
 
   /* Compute function values from Chebyshev-coefficients using a DCT-III. */
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)a,(R*)a);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)b,(R*)b);
 
   /*for (k = 0; k < length; k++)*/
   /*{*/
@@ -378,40 +382,40 @@ static inline void fpt_do_step_symmetric_l(double _Complex  *a, double _Complex
     /* Perform multiplication for both rows. */
     auvxpwy_symmetric(norm,set->z,b,a22,a,a21,length);
     auvxpwy_symmetric_2(norm*gam,a,a,a21,b,a22,length,x);
-    memcpy(b,set->z,length*sizeof(double _Complex));
+    memcpy(b,set->z, (size_t)(length) * sizeof(C));
     /* Compute Chebyshev-coefficients using a DCT-II. */
-    fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
+    fftw_execute_r2r(set->plans_dct2[tau-1],(R*)a,(R*)a);
     /* Compensate for factors introduced by a raw DCT-II. */
     a[0] *= 0.5;
   }
 
   /* Compute Chebyshev-coefficients using a DCT-II. */
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)b,(R*)b);
   /* Compensate for factors introduced by a raw DCT-II. */
   b[0] *= 0.5;
 }
 
 #define FPT_DO_STEP_TRANSPOSED(NAME,M1_FUNCTION,M2_FUNCTION) \
-static inline void NAME(double _Complex  *a, double _Complex *b, double *a11, \
-  double *a12, double *a21, double *a22, double g, int tau, fpt_set set) \
+static inline void NAME(C  *a, C *b, R *a11, \
+  R *a12, R *a21, R *a22, R g, int tau, fpt_set set) \
 { \
   /** The length of the coefficient arrays. */ \
   int length = 1<<(tau+1); \
   /** Twice the length of the coefficient arrays. */ \
-  double norm = 1.0/(length<<1); \
+  R norm = 1.0/(length<<1); \
   \
   /* Compute function values from Chebyshev-coefficients using a DCT-III. */ \
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a); \
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b); \
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)a,(R*)a); \
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)b,(R*)b); \
   \
   /* Perform matrix multiplication. */ \
   M1_FUNCTION(norm,g,set->z,a,a11,b,a21,length); \
   M2_FUNCTION(norm,g,b,a,a12,b,a22,length); \
-  memcpy(a,set->z,length*sizeof(double _Complex)); \
+  memcpy(a,set->z, (size_t)(length) * sizeof(C)); \
   \
   /* Compute Chebyshev-coefficients using a DCT-II. */ \
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a); \
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b); \
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)a,(R*)a); \
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)b,(R*)b); \
 }
 
 FPT_DO_STEP_TRANSPOSED(fpt_do_step_t,abuvxpwy,abuvxpwy)
@@ -420,64 +424,64 @@ FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric,abuvxpwy_symmetric1,abuvxpwy_symm
 /*FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric_l,abuvxpwy_symmetric2_2,abuvxpwy_symmetric2_1)*/
 
 
-static inline void fpt_do_step_t_symmetric_u(double _Complex  *a,
-  double _Complex *b, double *a11, double *a12, double *x,
-  double gam, int tau, fpt_set set)
+static inline void fpt_do_step_t_symmetric_u(C  *a,
+  C *b, R *a11, R *a12, R *x,
+  R gam, int tau, fpt_set set)
 {
   /** The length of the coefficient arrays. */
   int length = 1<<(tau+1);
   /** Twice the length of the coefficient arrays. */
-  double norm = 1.0/(length<<1);
+  R norm = 1.0/(length<<1);
 
   /* Compute function values from Chebyshev-coefficients using a DCT-III. */
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)a,(R*)a);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)b,(R*)b);
 
   /* Perform matrix multiplication. */
   abuvxpwy_symmetric1_1(norm,gam,set->z,a,a11,b,length,x);
   abuvxpwy_symmetric1_2(norm,gam,b,a,a12,b,length,x);
-  memcpy(a,set->z,length*sizeof(double _Complex));
+  memcpy(a,set->z, (size_t)(length) * sizeof(C));
 
   /* Compute Chebyshev-coefficients using a DCT-II. */
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)a,(R*)a);
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)b,(R*)b);
 }
 
-static inline void fpt_do_step_t_symmetric_l(double _Complex  *a,
-  double _Complex *b, double *a21, double *a22,
-  double *x, double gam, int tau, fpt_set set)
+static inline void fpt_do_step_t_symmetric_l(C  *a,
+  C *b, R *a21, R *a22,
+  R *x, R gam, int tau, fpt_set set)
 {
   /** The length of the coefficient arrays. */
   int length = 1<<(tau+1);
   /** Twice the length of the coefficient arrays. */
-  double norm = 1.0/(length<<1);
+  R norm = 1.0/(length<<1);
 
   /* Compute function values from Chebyshev-coefficients using a DCT-III. */
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
-  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)a,(R*)a);
+  fftw_execute_r2r(set->plans_dct3[tau-1],(R*)b,(R*)b);
 
   /* Perform matrix multiplication. */
   abuvxpwy_symmetric2_2(norm,gam,set->z,a,b,a21,length,x);
   abuvxpwy_symmetric2_1(norm,gam,b,a,b,a22,length,x);
-  memcpy(a,set->z,length*sizeof(double _Complex));
+  memcpy(a,set->z, (size_t)(length) * sizeof(C));
 
   /* Compute Chebyshev-coefficients using a DCT-II. */
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
-  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)a,(R*)a);
+  fftw_execute_r2r(set->plans_dct2[tau-1],(R*)b,(R*)b);
 }
 
 
-static void eval_clenshaw(const double *x, double *y, int size, int k, const double *alpha,
-  const double *beta, const double *gam)
+static void eval_clenshaw(const R *x, R *y, int size, int k, const R *alpha,
+  const R *beta, const R *gam)
 {
   /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
    * of knots  x[0], ..., x[size-1] by the Clenshaw algorithm
    */
   int i,j;
-  double a,b,x_val_act,a_old;
-  const double *x_act;
-  double *y_act;
-  const double *alpha_act, *beta_act, *gamma_act;
+  R a,b,x_val_act,a_old;
+  const R *x_act;
+  R *y_act;
+  const R *alpha_act, *beta_act, *gamma_act;
 
   /* Traverse all nodes. */
   x_act = x;
@@ -513,17 +517,17 @@ static void eval_clenshaw(const double *x, double *y, int size, int k, const dou
   }
 }
 
-static void eval_clenshaw2(const double *x, double *z, double *y, int size1, int size, int k, const double *alpha,
-  const double *beta, const double *gam)
+static void eval_clenshaw2(const R *x, R *z, R *y, int size1, int size, int k, const R *alpha,
+  const R *beta, const R *gam)
 {
   /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
    * of knots  x[0], ..., x[size-1] by the Clenshaw algorithm
    */
   int i,j;
-  double a,b,x_val_act,a_old;
-  const double *x_act;
-  double *y_act, *z_act;
-  const double *alpha_act, *beta_act, *gamma_act;
+  R a,b,x_val_act,a_old;
+  const R *x_act;
+  R *y_act, *z_act;
+  const R *alpha_act, *beta_act, *gamma_act;
 
   /* Traverse all nodes. */
   x_act = x;
@@ -569,18 +573,18 @@ static void eval_clenshaw2(const double *x, double *z, double *y, int size1, int
   fclose(f);*/
 }
 
-static int eval_clenshaw_thresh2(const double *x, double *z, double *y, int size, int k,
-  const double *alpha, const double *beta, const double *gam, const
-  double threshold)
+static int eval_clenshaw_thresh2(const R *x, R *z, R *y, int size, int k,
+  const R *alpha, const R *beta, const R *gam, const
+  R threshold)
 {
   /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
    * of knots  x[0], ..., x[size-1] by the Clenshaw algorithm
    */
   int i,j;
-  double a,b,x_val_act,a_old;
-  const double *x_act;
-  double *y_act, *z_act;
-  const double *alpha_act, *beta_act, *gamma_act;
+  R a,b,x_val_act,a_old;
+  const R *x_act;
+  R *y_act, *z_act;
+  const R *alpha_act, *beta_act, *gamma_act;
   R max = -nfft_float_property(NFFT_R_MAX);
   const R t = LOG10(FABS(threshold));
 
@@ -627,13 +631,13 @@ static int eval_clenshaw_thresh2(const double *x, double *z, double *y, int size
 }
 
 static inline void eval_sum_clenshaw_fast(const int N, const int M,
-  const double _Complex *a, const double *x, double _Complex *y,
-  const double *alpha, const double *beta, const double *gam,
-  const double lambda)
+  const C *a, const R *x, C *y,
+  const R *alpha, const R *beta, const R *gam,
+  const R lambda)
 {
   int j,k;
-  double _Complex tmp1, tmp2, tmp3;
-  double xc;
+  C tmp1, tmp2, tmp3;
+  R xc;
   
   /*fprintf(stderr, "Executing eval_sum_clenshaw_fast.\n");  
   fprintf(stderr, "Before transform:\n");  
@@ -705,18 +709,18 @@ static inline void eval_sum_clenshaw_fast(const int N, const int M,
  *   (alpha_k, beta_k, gamma_k \in \mathbb{R},\; k \ge 0)
  * \f]
  * with initial conditions \f$P_{-1}(x) := 0\f$, \f$P_0(x) := \lambda\f$
- * for given double _Complex coefficients \f$\left(a_k\right)_{k=0}^N \in
+ * for given complex coefficients \f$\left(a_k\right)_{k=0}^N \in
  * \mathbb{C}^{N+1}\f$ at given nodes \f$\left(x_j\right)_{j=0}^M \in
  * \mathbb{R}^{M+1}\f$, \f$M \in \mathbb{N}_0\f$.
  */
-static void eval_sum_clenshaw_transposed(int N, int M, double _Complex* a, double *x,
-  double _Complex *y, double _Complex *temp, double *alpha, double *beta, double *gam,
-  double lambda)
+static void eval_sum_clenshaw_transposed(int N, int M, C* a, R *x,
+  C *y, C *temp, R *alpha, R *beta, R *gam,
+  R lambda)
 {
   int j,k;
-  double _Complex* it1 = temp;
-  double _Complex* it2 = y;
-  double _Complex aux;
+  C* it1 = temp;
+  C* it2 = y;
+  C aux;
 
   /* Compute final result by multiplying with the constant lambda */
   a[0] = 0.0;
@@ -766,7 +770,7 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
 #endif
 
   /* Allocate memory for new DPT set. */
-  fpt_set_s *set = (fpt_set_s*)nfft_malloc(sizeof(fpt_set_s));
+  fpt_set_s *set = (fpt_set_s*)Y(malloc)(sizeof(fpt_set_s));
 
   /* Save parameters in structure. */
   set->flags = flags;
@@ -776,7 +780,7 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
   set->N = 1<<t;
 
   /* Allocate memory for M transforms. */
-  set->dpt = (fpt_data*) nfft_malloc(M*sizeof(fpt_data));
+  set->dpt = (fpt_data*) Y(malloc)((size_t)(M) * sizeof(fpt_data));
 
   /* Initialize with NULL pointer. */
   for (m = 0; m < set->M; m++)
@@ -790,14 +794,14 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
    * factor 2 introduced by the DCT-III, we set this coefficient to 0.5 here. */
 
   /* Allocate memory for array of pointers to node arrays. */
-  set->xcvecs = (double**) nfft_malloc((set->t)*sizeof(double*));
+  set->xcvecs = (R**) Y(malloc)((size_t)(set->t) * sizeof(R*));
   /* For each polynomial length starting with 4, compute the Chebyshev nodes
    * using a DCT-III. */
   plength = 4;
   for (tau = 1; tau < t+1; tau++)
   {
     /* Allocate memory for current array. */
-    set->xcvecs[tau-1] = (double*) nfft_malloc(plength*sizeof(double));
+    set->xcvecs[tau-1] = (R*) Y(malloc)((size_t)(plength) * sizeof(R));
     for (k = 0; k < plength; k++)
     {
       set->xcvecs[tau-1][k] = cos(((k+0.5)*KPI)/plength);
@@ -806,12 +810,12 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
   }
 
   /** Allocate memory for auxilliary arrays. */
-  set->work = (double _Complex*) nfft_malloc((2*set->N)*sizeof(double _Complex));
-  set->result = (double _Complex*) nfft_malloc((2*set->N)*sizeof(double _Complex));
+  set->work = (C*) Y(malloc)((size_t)(2 * set->N) * sizeof(C));
+  set->result = (C*) Y(malloc)((size_t)(2 * set->N) * sizeof(C));
 
-  set->lengths = (int*) nfft_malloc((set->t/*-1*/)*sizeof(int));
-  set->plans_dct2 = (fftw_plan*) nfft_malloc(sizeof(fftw_plan)*(set->t/*-1*/));
-  set->kindsr     = (fftw_r2r_kind*) nfft_malloc(2*sizeof(fftw_r2r_kind));
+  set->lengths = (int*) Y(malloc)((size_t)(set->t) * sizeof(int));
+  set->plans_dct2 = (fftw_plan*) Y(malloc)((size_t)(set->t) * sizeof(fftw_plan));
+  set->kindsr     = (fftw_r2r_kind*) Y(malloc)(2 * sizeof(fftw_r2r_kind));
   set->kindsr[0]  = FFTW_REDFT10;
   set->kindsr[1]  = FFTW_REDFT10;
   for (tau = 0, plength = 4; tau < set->t/*-1*/; tau++, plength<<=1)
@@ -823,8 +827,8 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
     fftw_plan_with_nthreads(nthreads);
 #endif
     set->plans_dct2[tau] =
-      fftw_plan_many_r2r(1, &set->lengths[tau], 2, (double*)set->work, NULL,
-                         2, 1, (double*)set->result, NULL, 2, 1,set->kindsr,
+      fftw_plan_many_r2r(1, &set->lengths[tau], 2, (R*)set->work, NULL,
+                         2, 1, (R*)set->result, NULL, 2, 1,set->kindsr,
                          0);
 #ifdef _OPENMP
 }
@@ -835,13 +839,13 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
   if (!(set->flags & FPT_NO_FAST_ALGORITHM))
   {
     /** Allocate memory for auxilliary arrays. */
-    set->vec3 = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
-    set->vec4 = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
-    set->z = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
+    set->vec3 = (C*) Y(malloc)((size_t)(set->N) * sizeof(C));
+    set->vec4 = (C*) Y(malloc)((size_t)(set->N) * sizeof(C));
+    set->z = (C*) Y(malloc)((size_t)(set->N) * sizeof(C));
 
     /** Initialize FFTW plans. */
-    set->plans_dct3 = (fftw_plan*) nfft_malloc(sizeof(fftw_plan)*(set->t/*-1*/));
-    set->kinds      = (fftw_r2r_kind*) nfft_malloc(2*sizeof(fftw_r2r_kind));
+    set->plans_dct3 = (fftw_plan*) Y(malloc)((size_t)(set->t) * sizeof(fftw_plan));
+    set->kinds      = (fftw_r2r_kind*) Y(malloc)(2 * sizeof(fftw_r2r_kind));
     set->kinds[0]   = FFTW_REDFT01;
     set->kinds[1]   = FFTW_REDFT01;
     for (tau = 0, plength = 4; tau < set->t/*-1*/; tau++, plength<<=1)
@@ -853,8 +857,8 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
     fftw_plan_with_nthreads(nthreads);
 #endif
       set->plans_dct3[tau] =
-        fftw_plan_many_r2r(1, &set->lengths[tau], 2, (double*)set->work, NULL,
-                           2, 1, (double*)set->result, NULL, 2, 1, set->kinds,
+        fftw_plan_many_r2r(1, &set->lengths[tau], 2, (R*)set->work, NULL,
+                           2, 1, (R*)set->result, NULL, 2, 1, set->kinds,
                            0);
 #ifdef _OPENMP
 }
@@ -870,16 +874,16 @@ fpt_set fpt_init(const int M, const int t, const unsigned int flags)
 
   if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
   {
-    set->xc_slow = (double*) nfft_malloc((set->N+1)*sizeof(double));
-    set->temp = (double _Complex*) nfft_malloc((set->N+1)*sizeof(double _Complex));
+    set->xc_slow = (R*) Y(malloc)((size_t)(set->N + 1) * sizeof(R));
+    set->temp = (C*) Y(malloc)((size_t)(set->N + 1) * sizeof(C));
   }
 
   /* Return the newly created DPT set. */
   return set;
 }
 
-void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
-  double *gam, int k_start, const double threshold)
+void fpt_precompute(fpt_set set, const int m, R *alpha, R *beta,
+  R *gam, int k_start, const R threshold)
 {
 
   int tau;          /**< Cascade level                                       */
@@ -894,17 +898,17 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
                          cascade for stabilization                           */
   int degree_stab;  /**< Degree of polynomials for the current level in the
                          cascade for stabilization                           */
-  double *a11;      /**< Array containing function values of the
+  R *a11;      /**< Array containing function values of the
                          (1,1)-component of U_k^n.                           */
-  double *a12;      /**< Array containing function values of the
+  R *a12;      /**< Array containing function values of the
                          (1,2)-component of U_k^n.                           */
-  double *a21;      /**< Array containing function values of the
+  R *a21;      /**< Array containing function values of the
                          (2,1)-component of U_k^n.                           */
-  double *a22;      /**< Array containing function values of the
+  R *a22;      /**< Array containing function values of the
                          (2,2)-component of U_k^n.                           */
-  const double *calpha;
-  const double *cbeta;
-  const double *cgamma;
+  const R *calpha;
+  const R *cbeta;
+  const R *cgamma;
   int needstab = 0; /**< Used to indicate that stabilization is neccessary.  */
   int k_start_tilde;
   int N_tilde;
@@ -930,9 +934,9 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
   if (!(set->flags & FPT_NO_FAST_ALGORITHM))
   {
     /* Save recursion coefficients. */
-    data->alphaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex));
-    data->betaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex));
-    data->gammaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex));
+    data->alphaN = (R*) Y(malloc)((size_t)(set->t - 1) * sizeof(C));
+    data->betaN = (R*) Y(malloc)((size_t)(set->t - 1) * sizeof(C));
+    data->gammaN = (R*) Y(malloc)((size_t)(set->t - 1) * sizeof(C));
 
     for (tau = 2; tau <= set->t; tau++)
     {
@@ -945,12 +949,12 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
     data->alpha_0 = alpha[1];
     data->beta_0 = beta[1];
 
-    k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
+    k_start_tilde = K_START_TILDE(data->k_start,Y(next_power_of_2)(data->k_start)
       /*set->N*/);
     N_tilde = N_TILDE(set->N);
 
     /* Allocate memory for the cascade with t = log_2(N) many levels. */
-    data->steps = (fpt_step**) nfft_malloc(sizeof(fpt_step*)*set->t);
+    data->steps = (fpt_step**) Y(malloc)((size_t)(set->t) * sizeof(fpt_step*));
 
     /* For tau = 1,...t compute the matrices U_{n,tau,l}. */
     plength = 4;
@@ -965,8 +969,8 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
 
       /* Allocate memory for current level. This level will contain 2^{t-tau-1}
        * many matrices. */
-      data->steps[tau] = (fpt_step*) nfft_malloc(sizeof(fpt_step)
-                         * (lastl+1));
+      data->steps[tau] = (fpt_step*) Y(malloc)(sizeof(fpt_step)
+                         * (size_t)(lastl + 1));
 
       /* For l = 0,...2^{t-tau-1}-1 compute the matrices U_{n,tau,l}. */
       for (l = firstl; l <= lastl; l++)
@@ -983,10 +987,10 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
         }
 
         /* Allocate memory for the components of U_{n,tau,l}. */
-        a11 = (double*) nfft_malloc(sizeof(double)*clength);
-        a12 = (double*) nfft_malloc(sizeof(double)*clength);
-        a21 = (double*) nfft_malloc(sizeof(double)*clength);
-        a22 = (double*) nfft_malloc(sizeof(double)*clength);
+        a11 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
+        a12 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
+        a21 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
+        a22 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
 
         /* Evaluate the associated polynomials at the 2^{tau+1} Chebyshev
          * nodes. */
@@ -1028,11 +1032,11 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
         /* Check if stabilization needed. */
         if (needstab == 0)
         {
-          data->steps[tau][l].a11 = (double**) nfft_malloc(sizeof(double*));
-          data->steps[tau][l].a12 = (double**) nfft_malloc(sizeof(double*));
-          data->steps[tau][l].a21 = (double**) nfft_malloc(sizeof(double*));
-          data->steps[tau][l].a22 = (double**) nfft_malloc(sizeof(double*));
-          data->steps[tau][l].g = (double*) nfft_malloc(sizeof(double));
+          data->steps[tau][l].a11 = (R**) Y(malloc)(sizeof(R*));
+          data->steps[tau][l].a12 = (R**) Y(malloc)(sizeof(R*));
+          data->steps[tau][l].a21 = (R**) Y(malloc)(sizeof(R*));
+          data->steps[tau][l].a22 = (R**) Y(malloc)(sizeof(R*));
+          data->steps[tau][l].g = (R*) Y(malloc)(sizeof(R));
           /* No stabilization needed. */
           data->steps[tau][l].a11[0] = a11;
           data->steps[tau][l].a12[0] = a12;
@@ -1045,7 +1049,7 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
         {
           /* Stabilize. */
           degree_stab = degree*(2*l+1);
-          X(next_power_of_2_exp)((l+1)*(1<<(tau+1)),&N_stab,&t_stab);
+          Y(next_power_of_2_exp)((l+1)*(1<<(tau+1)),&N_stab,&t_stab);
 
           /* Old arrays are to small. */
           nfft_free(a11);
@@ -1053,11 +1057,11 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
           nfft_free(a21);
           nfft_free(a22);
 
-          data->steps[tau][l].a11 = (double**) nfft_malloc(sizeof(double*));
-          data->steps[tau][l].a12 = (double**)nfft_malloc(sizeof(double*));
-          data->steps[tau][l].a21 = (double**) nfft_malloc(sizeof(double*));
-          data->steps[tau][l].a22 = (double**) nfft_malloc(sizeof(double*));
-          data->steps[tau][l].g = (double*) nfft_malloc(sizeof(double));
+          data->steps[tau][l].a11 = (R**) Y(malloc)(sizeof(R*));
+          data->steps[tau][l].a12 = (R**)Y(malloc)(sizeof(R*));
+          data->steps[tau][l].a21 = (R**) Y(malloc)(sizeof(R*));
+          data->steps[tau][l].a22 = (R**) Y(malloc)(sizeof(R*));
+          data->steps[tau][l].g = (R*) Y(malloc)(sizeof(R));
 
           plength_stab = N_stab;
 
@@ -1069,10 +1073,10 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
               clength_1 = plength_stab;
               clength_2 = plength_stab;
               /* Allocate memory for arrays. */
-              a11 = (double*) nfft_malloc(sizeof(double)*clength_1);
-              a12 = (double*) nfft_malloc(sizeof(double)*clength_1);
-              a21 = (double*) nfft_malloc(sizeof(double)*clength_2);
-              a22 = (double*) nfft_malloc(sizeof(double)*clength_2);
+              a11 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_1);
+              a12 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_1);
+              a21 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_2);
+              a22 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_2);
               /* Get the pointers to the three-term recurrence coeffcients. */
               calpha = &(alpha[1]); cbeta = &(beta[1]); cgamma = &(gam[1]);
               eval_clenshaw2(set->xcvecs[t_stab-2], a11, a21, clength_1,
@@ -1085,8 +1089,8 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
               clength = plength_stab/2;
               if (m%2 == 0)
               {
-                a11 = (double*) nfft_malloc(sizeof(double)*clength);
-                a12 = (double*) nfft_malloc(sizeof(double)*clength);
+                a11 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
+                a12 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
                 a21 = 0;
                 a22 = 0;
                 calpha = &(alpha[2]); cbeta = &(beta[2]); cgamma = &(gam[2]);
@@ -1099,8 +1103,8 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
               {
                 a11 = 0;
                 a12 = 0;
-                a21 = (double*) nfft_malloc(sizeof(double)*clength);
-                a22 = (double*) nfft_malloc(sizeof(double)*clength);
+                a21 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
+                a22 = (R*) Y(malloc)(sizeof(R) * (size_t)clength);
                 calpha = &(alpha[1]); cbeta = &(beta[1]); cgamma = &(gam[1]);
                 eval_clenshaw(set->xcvecs[t_stab-2], a21, clength,
                   degree_stab-1,calpha, cbeta, cgamma);
@@ -1113,10 +1117,10 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
           {
             clength_1 = plength_stab;
             clength_2 = plength_stab;
-            a11 = (double*) nfft_malloc(sizeof(double)*clength_1);
-            a12 = (double*) nfft_malloc(sizeof(double)*clength_1);
-            a21 = (double*) nfft_malloc(sizeof(double)*clength_2);
-            a22 = (double*) nfft_malloc(sizeof(double)*clength_2);
+            a11 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_1);
+            a12 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_1);
+            a21 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_2);
+            a22 = (R*) Y(malloc)(sizeof(R) * (size_t)clength_2);
             calpha = &(alpha[2]);
             cbeta = &(beta[2]);
             cgamma = &(gam[2]);
@@ -1150,34 +1154,34 @@ void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
     /* Check, if recurrence coefficients must be copied. */
     if (set->flags & FPT_PERSISTENT_DATA)
     {
-      data->_alpha = (double*) alpha;
-      data->_beta = (double*) beta;
-      data->_gamma = (double*) gam;
+      data->_alpha = (R*) alpha;
+      data->_beta = (R*) beta;
+      data->_gamma = (R*) gam;
     }
     else
     {
-      data->_alpha = (double*) nfft_malloc((set->N+1)*sizeof(double));
-      data->_beta = (double*) nfft_malloc((set->N+1)*sizeof(double));
-      data->_gamma = (double*) nfft_malloc((set->N+1)*sizeof(double));
-      memcpy(data->_alpha,alpha,(set->N+1)*sizeof(double));
-      memcpy(data->_beta,beta,(set->N+1)*sizeof(double));
-      memcpy(data->_gamma,gam,(set->N+1)*sizeof(double));
+      data->_alpha = (R*) Y(malloc)((size_t)(set->N + 1) * sizeof(R));
+      data->_beta = (R*) Y(malloc)((size_t)(set->N + 1) * sizeof(R));
+      data->_gamma = (R*) Y(malloc)((size_t)(set->N + 1) * sizeof(R));
+      memcpy(data->_alpha,alpha, (size_t)((set->N + 1)) * sizeof(R));
+      memcpy(data->_beta,beta, (size_t)((set->N + 1)) * sizeof(R));
+      memcpy(data->_gamma,gam, (size_t)((set->N + 1)) * sizeof(R));
     }
   }
 }
 
-void fpt_trafo_direct(fpt_set set, const int m, const double _Complex *x, double _Complex *y,
+void fpt_trafo_direct(fpt_set set, const int m, const C *x, C *y,
   const int k_end, const unsigned int flags)
 {
   int j;
   fpt_data *data = &(set->dpt[m]);
   int Nk;
   int tk;
-  double norm;
+  R norm;
   
     //fprintf(stderr, "Executing dpt.\n");  
 
-  X(next_power_of_2_exp)(k_end+1,&Nk,&tk);
+  Y(next_power_of_2_exp)(k_end+1,&Nk,&tk);
   norm = 2.0/(Nk<<1);
 
     //fprintf(stderr, "Norm = %e.\n", norm);  
@@ -1196,8 +1200,8 @@ void fpt_trafo_direct(fpt_set set, const int m, const double _Complex *x, double
         //fprintf(stderr, "x[%4d] = %e.\n", j, set->xc_slow[j]);  
     }
 
-    memset(set->result,0U,data->k_start*sizeof(double _Complex));
-    memcpy(&set->result[data->k_start],x,(k_end-data->k_start+1)*sizeof(double _Complex));
+    memset(set->result,0U,data->k_start*sizeof(C));
+    memcpy(&set->result[data->k_start],x, (size_t)((k_end - data->k_start + 1)) * sizeof(C));
 
     /*eval_sum_clenshaw(k_end, k_end, set->result, set->xc_slow,
       y, set->work, &data->alpha[1], &data->beta[1], &data->gamma[1],
@@ -1207,15 +1211,15 @@ void fpt_trafo_direct(fpt_set set, const int m, const double _Complex *x, double
   }
   else
   {
-    memset(set->temp,0U,data->k_start*sizeof(double _Complex));
-    memcpy(&set->temp[data->k_start],x,(k_end-data->k_start+1)*sizeof(double _Complex));
+    memset(set->temp,0U,data->k_start*sizeof(C));
+    memcpy(&set->temp[data->k_start],x, (size_t)((k_end - data->k_start + 1)) * sizeof(C));
 
     eval_sum_clenshaw_fast(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
       set->result, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
       data->gamma_m1);
 
-    fftw_execute_r2r(set->plans_dct2[tk-2],(double*)set->result,
-      (double*)set->result);
+    fftw_execute_r2r(set->plans_dct2[tk-2],(R*)set->result,
+      (R*)set->result);
 
     set->result[0] *= 0.5;
     for (j = 0; j < Nk; j++)
@@ -1223,11 +1227,11 @@ void fpt_trafo_direct(fpt_set set, const int m, const double _Complex *x, double
       set->result[j] *= norm;
     }
 
-    memcpy(y,set->result,(k_end+1)*sizeof(double _Complex));
+    memcpy(y,set->result, (size_t)((k_end + 1)) * sizeof(C));
   }
 }
 
-void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Complex *y,
+void fpt_trafo(fpt_set set, const int m, const C *x, C *y,
   const int k_end, const unsigned int flags)
 {
   /* Get transformation data. */
@@ -1264,8 +1268,8 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
   /** Loop counter */
   int k;
 
-  double _Complex *work_ptr;
-  const double _Complex *x_ptr;
+  C *work_ptr;
+  const C *x_ptr;
 
   /* Check, if slow transformation should be used due to small bandwidth. */
   if (k_end < FPT_BREAK_EVEN)
@@ -1275,7 +1279,7 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
     return;
   }
 
-  X(next_power_of_2_exp)(k_end,&Nk,&tk);
+  Y(next_power_of_2_exp)(k_end,&Nk,&tk);
   k_start_tilde = K_START_TILDE(data->k_start,Nk);
   k_end_tilde = K_END_TILDE(k_end,Nk);
 
@@ -1291,20 +1295,20 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
 {
     fftw_plan_with_nthreads(nthreads);
 #endif
-    plan = fftw_plan_many_r2r(1, &length, 2, (double*)set->work, NULL, 2, 1,
-      (double*)set->work, NULL, 2, 1, kinds, 0U);
+    plan = fftw_plan_many_r2r(1, &length, 2, (R*)set->work, NULL, 2, 1,
+      (R*)set->work, NULL, 2, 1, kinds, 0U);
 #ifdef _OPENMP
 }
 #endif
   }
 
   /* Initialize working arrays. */
-  memset(set->result,0U,2*Nk*sizeof(double _Complex));
+  memset(set->result,0U,2*Nk*sizeof(C));
 
   /* The first step. */
 
   /* Set the first 2*data->k_start coefficients to zero. */
-  memset(set->work,0U,2*data->k_start*sizeof(double _Complex));
+  memset(set->work,0U,2*data->k_start*sizeof(C));
 
   work_ptr = &set->work[2*data->k_start];
   x_ptr = x;
@@ -1316,7 +1320,7 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
   }
 
   /* Set the last 2*(set->N-1-k_end_tilde) coefficients to zero. */
-  memset(&set->work[2*(k_end_tilde+1)],0U,2*(Nk-1-k_end_tilde)*sizeof(double _Complex));
+  memset(&set->work[2*(k_end_tilde+1)],0U,2*(Nk-1-k_end_tilde)*sizeof(C));
 
   /* If k_end == Nk, use three-term recurrence to map last coefficient x_{Nk} to
    * x_{Nk-1} and x_{Nk-2}. */
@@ -1340,15 +1344,15 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
     for (l = firstl; l <= lastl; l++)
     {
       /* Copy vectors to multiply into working arrays zero-padded to twice the length. */
-      memcpy(set->vec3,&(set->work[(plength/2)*(4*l+2)]),(plength/2)*sizeof(double _Complex));
-      memcpy(set->vec4,&(set->work[(plength/2)*(4*l+3)]),(plength/2)*sizeof(double _Complex));
-      memset(&set->vec3[plength/2],0U,(plength/2)*sizeof(double _Complex));
-      memset(&set->vec4[plength/2],0U,(plength/2)*sizeof(double _Complex));
+      memcpy(set->vec3,&(set->work[(plength/2)*(4*l+2)]), (size_t)((plength / 2)) * sizeof(C));
+      memcpy(set->vec4,&(set->work[(plength/2)*(4*l+3)]), (size_t)((plength / 2)) * sizeof(C));
+      memset(&set->vec3[plength/2],0U,(plength/2)*sizeof(C));
+      memset(&set->vec4[plength/2],0U,(plength/2)*sizeof(C));
 
       /* Copy coefficients into first half. */
-      memcpy(&(set->work[(plength/2)*(4*l+2)]),&(set->work[(plength/2)*(4*l+1)]),(plength/2)*sizeof(double _Complex));
-      memset(&(set->work[(plength/2)*(4*l+1)]),0U,(plength/2)*sizeof(double _Complex));
-      memset(&(set->work[(plength/2)*(4*l+3)]),0U,(plength/2)*sizeof(double _Complex));
+      memcpy(&(set->work[(plength/2)*(4*l+2)]),&(set->work[(plength/2)*(4*l+1)]), (size_t)((plength / 2)) * sizeof(C));
+      memset(&(set->work[(plength/2)*(4*l+1)]),0U,(plength/2)*sizeof(C));
+      memset(&(set->work[(plength/2)*(4*l+3)]),0U,(plength/2)*sizeof(C));
 
       /* Get matrix U_{n,tau,l} */
       step = &(data->steps[tau][l]);
@@ -1406,8 +1410,8 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
         /* Set rest of vectors explicitely to zero */
         /*fprintf(stderr,"fpt_trafo: stabilizing: plength = %d, plength_stab = %d\n",
           plength, plength_stab);*/
-        memset(&set->vec3[plength/2],0U,(plength_stab-plength/2)*sizeof(double _Complex));
-        memset(&set->vec4[plength/2],0U,(plength_stab-plength/2)*sizeof(double _Complex));
+        memset(&set->vec3[plength/2],0U,(plength_stab-plength/2)*sizeof(C));
+        memset(&set->vec4[plength/2],0U,(plength_stab-plength/2)*sizeof(C));
 
         /* Multiply third and fourth polynomial with matrix U. */
         /* Check for symmetry. */
@@ -1499,7 +1503,7 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
   if (flags & FPT_FUNCTION_VALUES)
   {
     y[0] *= 2.0;
-    fftw_execute_r2r(plan,(double*)y,(double*)y);
+    fftw_execute_r2r(plan,(R*)y,(R*)y);
 #ifdef _OPENMP
     #pragma omp critical (nfft_omp_critical_fftw_plan)
 #endif
@@ -1511,16 +1515,16 @@ void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Compl
   }
 }
 
-void fpt_transposed_direct(fpt_set set, const int m, double _Complex *x,
-  double _Complex *y, const int k_end, const unsigned int flags)
+void fpt_transposed_direct(fpt_set set, const int m, C *x,
+  C *y, const int k_end, const unsigned int flags)
 {
   int j;
   fpt_data *data = &(set->dpt[m]);
   int Nk;
   int tk;
-  double norm;
+  R norm;
 
-  X(next_power_of_2_exp)(k_end+1,&Nk,&tk);
+  Y(next_power_of_2_exp)(k_end+1,&Nk,&tk);
   norm = 2.0/(Nk<<1);
 
   if (set->flags & FPT_NO_DIRECT_ALGORITHM)
@@ -1539,32 +1543,32 @@ void fpt_transposed_direct(fpt_set set, const int m, double _Complex *x,
       y, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
       data->gamma_m1);
 
-    memcpy(x,&set->result[data->k_start],(k_end-data->k_start+1)*
-      sizeof(double _Complex));
+    memcpy(x,&set->result[data->k_start], (size_t)((k_end - data->k_start + 1)) *
+      sizeof(C));
   }
   else
   {
-    memcpy(set->result,y,(k_end+1)*sizeof(double _Complex));
-    memset(&set->result[k_end+1],0U,(Nk-k_end-1)*sizeof(double _Complex));
+    memcpy(set->result,y, (size_t)((k_end + 1)) * sizeof(C));
+    memset(&set->result[k_end+1],0U,(Nk-k_end-1)*sizeof(C));
 
     for (j = 0; j < Nk; j++)
     {
       set->result[j] *= norm;
     }
 
-    fftw_execute_r2r(set->plans_dct3[tk-2],(double*)set->result,
-      (double*)set->result);
+    fftw_execute_r2r(set->plans_dct3[tk-2],(R*)set->result,
+      (R*)set->result);
 
     eval_sum_clenshaw_transposed(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
       set->result, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
       data->gamma_m1);
 
-    memcpy(x,&set->temp[data->k_start],(k_end-data->k_start+1)*sizeof(double _Complex));
+    memcpy(x,&set->temp[data->k_start], (size_t)((k_end - data->k_start + 1)) * sizeof(C));
   }
 }
 
-void fpt_transposed(fpt_set set, const int m, double _Complex *x,
-  double _Complex *y, const int k_end, const unsigned int flags)
+void fpt_transposed(fpt_set set, const int m, C *x,
+  C *y, const int k_end, const unsigned int flags)
 {
   /* Get transformation data. */
   fpt_data *data = &(set->dpt[m]);
@@ -1607,7 +1611,7 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
     return;
   }
 
-  X(next_power_of_2_exp)(k_end,&Nk,&tk);
+  Y(next_power_of_2_exp)(k_end,&Nk,&tk);
   k_start_tilde = K_START_TILDE(data->k_start,Nk);
   k_end_tilde = K_END_TILDE(k_end,Nk);
 
@@ -1625,12 +1629,12 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
 {
     fftw_plan_with_nthreads(nthreads);
 #endif
-    plan = fftw_plan_many_r2r(1, &length, 2, (double*)set->work, NULL, 2, 1,
-      (double*)set->work, NULL, 2, 1, kinds, 0U);
+    plan = fftw_plan_many_r2r(1, &length, 2, (R*)set->work, NULL, 2, 1,
+      (R*)set->work, NULL, 2, 1, kinds, 0U);
 #ifdef _OPENMP
 }
 #endif
-    fftw_execute_r2r(plan,(double*)y,(double*)set->result);
+    fftw_execute_r2r(plan,(R*)y,(R*)set->result);
 #ifdef _OPENMP
     #pragma omp critical (nfft_omp_critical_fftw_plan)
 #endif
@@ -1642,18 +1646,18 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
   }
   else
   {
-    memcpy(set->result,y,(k_end+1)*sizeof(double _Complex));
+    memcpy(set->result,y, (size_t)((k_end + 1)) * sizeof(C));
   }
 
   /* Initialize working arrays. */
-  memset(set->work,0U,2*Nk*sizeof(double _Complex));
+  memset(set->work,0U,2*Nk*sizeof(C));
 
   /* The last step is now the first step. */
   for (k = 0; k <= k_end; k++)
   {
     set->work[k] = data->gamma_m1*set->result[k];
   }
-  //memset(&set->work[k_end+1],0U,(Nk+1-k_end)*sizeof(double _Complex));
+  //memset(&set->work[k_end+1],0U,(Nk+1-k_end)*sizeof(C));
 
   set->work[Nk] = data->gamma_m1*(data->beta_0*set->result[0] +
     data->alpha_0*set->result[1]);
@@ -1664,11 +1668,11 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
   }
   if (k_end<Nk)
   {
-    memset(&set->work[k_end],0U,(Nk-k_end)*sizeof(double _Complex));
+    memset(&set->work[k_end],0U,(Nk-k_end)*sizeof(C));
   }
 
   /** Save copy of inpute data for stabilization steps. */
-  memcpy(set->result,set->work,2*Nk*sizeof(double _Complex));
+  memcpy(set->result,set->work, (size_t)(2 * Nk) * sizeof(C));
 
   /* Compute the remaining steps. */
   plength = Nk;
@@ -1683,11 +1687,11 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
     for (l = firstl; l <= lastl; l++)
     {
       /* Initialize second half of coefficient arrays with zeros. */
-      memcpy(set->vec3,&(set->work[(plength/2)*(4*l+0)]),plength*sizeof(double _Complex));
-      memcpy(set->vec4,&(set->work[(plength/2)*(4*l+2)]),plength*sizeof(double _Complex));
+      memcpy(set->vec3,&(set->work[(plength/2)*(4*l+0)]), (size_t)(plength) * sizeof(C));
+      memcpy(set->vec4,&(set->work[(plength/2)*(4*l+2)]), (size_t)(plength) * sizeof(C));
 
       memcpy(&set->work[(plength/2)*(4*l+1)],&(set->work[(plength/2)*(4*l+2)]),
-        (plength/2)*sizeof(double _Complex));
+        (size_t)((plength / 2)) * sizeof(C));
 
       /* Get matrix U_{n,tau,l} */
       step = &(data->steps[tau][l]);
@@ -1707,7 +1711,7 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
           fpt_do_step_t(set->vec3, set->vec4, step->a11[0], step->a12[0],
             step->a21[0], step->a22[0], step->g[0], tau, set);
         }
-        memcpy(&(set->vec3[plength/2]), set->vec4,(plength/2)*sizeof(double _Complex));
+        memcpy(&(set->vec3[plength/2]), set->vec4, (size_t)((plength / 2)) * sizeof(C));
 
         for (k = 0; k < plength; k++)
         {
@@ -1720,8 +1724,8 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
         plength_stab = step->Ns;
         t_stab = step->ts;
 
-        memcpy(set->vec3,set->result,plength_stab*sizeof(double _Complex));
-        memcpy(set->vec4,&(set->result[Nk]),plength_stab*sizeof(double _Complex));
+        memcpy(set->vec3,set->result, (size_t)(plength_stab) * sizeof(C));
+        memcpy(set->vec4,&(set->result[Nk]), (size_t)(plength_stab) * sizeof(C));
 
         /* Multiply third and fourth polynomial with matrix U. */
         if (set->flags & FPT_AL_SYMMETRY)
@@ -1748,7 +1752,7 @@ void fpt_transposed(fpt_set set, const int m, double _Complex *x,
               step->a21[0], step->a22[0], step->g[0], t_stab-1, set);
         }
 
-        memcpy(&(set->vec3[plength/2]),set->vec4,(plength/2)*sizeof(double _Complex));
+        memcpy(&(set->vec3[plength/2]),set->vec4, (size_t)((plength / 2)) * sizeof(C));
 
         for (k = 0; k < plength; k++)
         {
@@ -1801,7 +1805,7 @@ void fpt_finalize(fpt_set set)
       data->gammaN = NULL;
 
       /* Free precomputed data. */
-      k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
+      k_start_tilde = K_START_TILDE(data->k_start,Y(next_power_of_2)(data->k_start)
         /*set->N*/);
       N_tilde = N_TILDE(set->N);
       plength = 4;