From 423b3165ee91961b60805bebc110896925c6df3c Mon Sep 17 00:00:00 2001 From: "Xuanyao (Kelvin) Fong" Date: Mon, 6 Jun 2022 09:37:40 +0800 Subject: [PATCH] Add relative error check in adaptive time stepping algorithm for rk23, rk45dp and rk56 time steppers --- cuda/vecnorm.cu | 14 +++++++++ cuda/vecnorm.go | 18 +++++++++++ engine/rk23.go | 45 ++++++++++++++++++++++----- engine/rk45dp.go | 45 ++++++++++++++++++++++----- engine/rk56.go | 81 +++++++++++++++++++++++++++++++++++++----------- engine/run.go | 3 ++ 6 files changed, 174 insertions(+), 32 deletions(-) create mode 100644 cuda/vecnorm.cu create mode 100644 cuda/vecnorm.go diff --git a/cuda/vecnorm.cu b/cuda/vecnorm.cu new file mode 100644 index 000000000..92670f5dc --- /dev/null +++ b/cuda/vecnorm.cu @@ -0,0 +1,14 @@ +#include "float3.h" + +// dst = sqrt(ax*ax + ay*ay + az*az) +extern "C" __global__ void +vecnorm(float* __restrict__ dst, + float* __restrict__ ax, float* __restrict__ ay, float* __restrict__ az, + int N) { + + int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; + if (i < N) { + float3 A = {ax[i], ay[i], az[i]}; + dst[i] = sqrtf(dot(A, A)); + } +} diff --git a/cuda/vecnorm.go b/cuda/vecnorm.go new file mode 100644 index 000000000..25e01e998 --- /dev/null +++ b/cuda/vecnorm.go @@ -0,0 +1,18 @@ +package cuda + +import ( + "github.com/mumax/3/data" + "github.com/mumax/3/util" +) + +// dst = sqrt(dot(a, a)), +func VecNorm(dst *data.Slice, a *data.Slice) { + util.Argument(dst.NComp() == 1 && a.NComp() == 3) + util.Argument(dst.Len() == a.Len()) + + N := dst.Len() + cfg := make1DConf(N) + k_vecnorm_async(dst.DevPtr(0), + a.DevPtr(X), a.DevPtr(Y), a.DevPtr(Z), + N, cfg) +} diff --git a/engine/rk23.go b/engine/rk23.go index 6d0d38f64..a207a743a 100644 --- a/engine/rk23.go +++ b/engine/rk23.go @@ -87,13 +87,44 @@ func (rk *RK23) Step() { // adjust next time step if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop - // step OK - setLastErr(err) - setMaxTorque(k4) - NSteps++ - Time = t0 + Dt_si - adaptDt(math.Pow(MaxErr/err, 1./3.)) - data.Copy(rk.k1, k4) // FSAL + // Passed absolute error. Check relative error... + errnorm := cuda.Buffer(1, size) + defer cuda.Recycle(errnorm) + cuda.VecNorm(errnorm, Err) + ddtnorm := cuda.Buffer(1, size) + defer cuda.Recycle(ddtnorm) + cuda.VecNorm(ddtnorm, k4) + maxdm := cuda.MaxVecNorm(k4) + fail := 0 + rlerr := float64(0.0) + if maxdm < MinSlope { // Only step using relerr if dmdt is big enough. Overcomes equilibrium problem + fail = 0 + } else { + cuda.Div(errnorm, errnorm, ddtnorm) //re-use errnorm + rlerr = float64(cuda.MaxAbs(errnorm)) + fail = 1 + } + if fail == 0 || RelErr <= 0.0 || rlerr < RelErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop + // step OK + setLastErr(err) + setMaxTorque(k4) + NSteps++ + Time = t0 + Dt_si + if fail == 0 { + adaptDt(math.Pow(MaxErr/err, 1./3.)) + } else { + adaptDt(math.Pow(RelErr/rlerr, 1./3.)) + } + data.Copy(rk.k1, k4) // FSAL + } else { + // undo bad step + //util.Println("Bad step at t=", t0, ", err=", err) + util.Assert(FixDt == 0) + Time = t0 + data.Copy(m, m0) + NUndone++ + adaptDt(math.Pow(RelErr/rlerr, 1./4.)) + } } else { // undo bad step //util.Println("Bad step at t=", t0, ", err=", err) diff --git a/engine/rk45dp.go b/engine/rk45dp.go index 4f59642f2..ac453cd4a 100644 --- a/engine/rk45dp.go +++ b/engine/rk45dp.go @@ -101,13 +101,44 @@ func (rk *RK45DP) Step() { // adjust next time step if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop - // step OK - setLastErr(err) - setMaxTorque(k7) - NSteps++ - Time = t0 + Dt_si - adaptDt(math.Pow(MaxErr/err, 1./5.)) - data.Copy(rk.k1, k7) // FSAL + //Passed absolute error. Check relative error... + errnorm := cuda.Buffer(1, size) + defer cuda.Recycle(errnorm) + cuda.VecNorm(errnorm, Err) + ddtnorm := cuda.Buffer(1, size) + defer cuda.Recycle(ddtnorm) + cuda.VecNorm(ddtnorm, k7) + maxdm := cuda.MaxVecNorm(k7) + fail := 0 + rlerr := float64(0.0) + if maxdm < MinSlope { // Only step using relerr if dmdt is big enough. Overcomes equilibrium problem + fail = 0 + } else { + cuda.Div(errnorm, errnorm, ddtnorm) //re-use errnorm + rlerr = float64(cuda.MaxAbs(errnorm)) + fail = 1 + } + if fail == 0 || RelErr <= 0.0 || rlerr < RelErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop + // step OK + setLastErr(err) + setMaxTorque(k7) + NSteps++ + Time = t0 + Dt_si + if fail == 0 { + adaptDt(math.Pow(MaxErr/err, 1./5.)) + } else { + adaptDt(math.Pow(RelErr/rlerr, 1./5.)) + } + data.Copy(rk.k1, k7) // FSAL + } else { + // undo bad step + //util.Println("Bad step at t=", t0, ", err=", err) + util.Assert(FixDt == 0) + Time = t0 + data.Copy(m, m0) + NUndone++ + adaptDt(math.Pow(RelErr/rlerr, 1./6.)) + } } else { // undo bad step //util.Println("Bad step at t=", t0, ", err=", err) diff --git a/engine/rk56.go b/engine/rk56.go index 5bbe8738d..18009b1fb 100644 --- a/engine/rk56.go +++ b/engine/rk56.go @@ -8,6 +8,7 @@ import ( ) type RK56 struct { + k1 *data.Slice // torque at end of step is kept for beginning of next step } func (rk *RK56) Step() { @@ -19,14 +20,24 @@ func (rk *RK56) Step() { Dt_si = FixDt } + // upon resize: remove wrongly sized k1 + if rk.k1.Size() != m.Size() { + rk.Free() + } + + // first step ever: one-time k1 init and eval + if rk.k1 == nil { + rk.k1 = cuda.NewSlice(3, size) + torqueFn(rk.k1) + } + t0 := Time // backup magnetization m0 := cuda.Buffer(3, size) defer cuda.Recycle(m0) data.Copy(m0, m) - k1, k2, k3, k4, k5, k6, k7, k8 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size) - defer cuda.Recycle(k1) + k2, k3, k4, k5, k6, k7, k8 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size) defer cuda.Recycle(k2) defer cuda.Recycle(k3) defer cuda.Recycle(k4) @@ -39,73 +50,105 @@ func (rk *RK56) Step() { h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL // stage 1 - torqueFn(k1) + torqueFn(rk.k1) // stage 2 Time = t0 + (1./6.)*Dt_si - cuda.Madd2(m, m, k1, 1, (1./6.)*h) // m = m*1 + k1*h/6 + cuda.Madd2(m, m, rk.k1, 1, (1./6.)*h) // m = m*1 + k1*h/6 M.normalize() torqueFn(k2) // stage 3 Time = t0 + (4./15.)*Dt_si - cuda.Madd3(m, m0, k1, k2, 1, (4./75.)*h, (16./75.)*h) + cuda.Madd3(m, m0, rk.k1, k2, 1, (4./75.)*h, (16./75.)*h) M.normalize() torqueFn(k3) // stage 4 Time = t0 + (2./3.)*Dt_si - cuda.Madd4(m, m0, k1, k2, k3, 1, (5./6.)*h, (-8./3.)*h, (5./2.)*h) + cuda.Madd4(m, m0, rk.k1, k2, k3, 1, (5./6.)*h, (-8./3.)*h, (5./2.)*h) M.normalize() torqueFn(k4) // stage 5 Time = t0 + (4./5.)*Dt_si - cuda.Madd5(m, m0, k1, k2, k3, k4, 1, (-8./5.)*h, (144./25.)*h, (-4.)*h, (16./25.)*h) + cuda.Madd5(m, m0, rk.k1, k2, k3, k4, 1, (-8./5.)*h, (144./25.)*h, (-4.)*h, (16./25.)*h) M.normalize() torqueFn(k5) // stage 6 Time = t0 + (1.)*Dt_si - cuda.Madd6(m, m0, k1, k2, k3, k4, k5, 1, (361./320.)*h, (-18./5.)*h, (407./128.)*h, (-11./80.)*h, (55./128.)*h) + cuda.Madd6(m, m0, rk.k1, k2, k3, k4, k5, 1, (361./320.)*h, (-18./5.)*h, (407./128.)*h, (-11./80.)*h, (55./128.)*h) M.normalize() torqueFn(k6) // stage 7 Time = t0 - cuda.Madd5(m, m0, k1, k3, k4, k5, 1, (-11./640.)*h, (11./256.)*h, (-11/160.)*h, (11./256.)*h) + cuda.Madd5(m, m0, rk.k1, k3, k4, k5, 1, (-11./640.)*h, (11./256.)*h, (-11/160.)*h, (11./256.)*h) M.normalize() torqueFn(k7) // stage 8 Time = t0 + (1.)*Dt_si - cuda.Madd7(m, m0, k1, k2, k3, k4, k5, k7, 1, (93./640.)*h, (-18./5.)*h, (803./256.)*h, (-11./160.)*h, (99./256.)*h, (1.)*h) + cuda.Madd7(m, m0, rk.k1, k2, k3, k4, k5, k7, 1, (93./640.)*h, (-18./5.)*h, (803./256.)*h, (-11./160.)*h, (99./256.)*h, (1.)*h) M.normalize() torqueFn(k8) // stage 9: 6th order solution Time = t0 + (1.)*Dt_si //madd6(m, m0, k1, k3, k4, k5, k6, 1, (31./384.)*h, (1125./2816.)*h, (9./32.)*h, (125./768.)*h, (5./66.)*h) - cuda.Madd7(m, m0, k1, k3, k4, k5, k7, k8, 1, (7./1408.)*h, (1125./2816.)*h, (9./32.)*h, (125./768.)*h, (5./66.)*h, (5./66.)*h) + cuda.Madd7(m, m0, rk.k1, k3, k4, k5, k7, k8, 1, (7./1408.)*h, (1125./2816.)*h, (9./32.)*h, (125./768.)*h, (5./66.)*h, (5./66.)*h) M.normalize() torqueFn(k2) // re-use k2 // error estimate Err := cuda.Buffer(3, size) defer cuda.Recycle(Err) - cuda.Madd4(Err, k1, k6, k7, k8, (-5. / 66.), (-5. / 66.), (5. / 66.), (5. / 66.)) + cuda.Madd4(Err, rk.k1, k6, k7, k8, (-5. / 66.), (-5. / 66.), (5. / 66.), (5. / 66.)) // determine error err := cuda.MaxVecNorm(Err) * float64(h) // adjust next time step if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop - // step OK - setLastErr(err) - setMaxTorque(k2) - NSteps++ - Time = t0 + Dt_si - adaptDt(math.Pow(MaxErr/err, 1./6.)) + //Passed absolute error. Check relative error... + errnorm := cuda.Buffer(1, size) + defer cuda.Recycle(errnorm) + cuda.VecNorm(errnorm, Err) + ddtnorm := cuda.Buffer(1, size) + defer cuda.Recycle(ddtnorm) + cuda.VecNorm(ddtnorm, k2) + maxdm := cuda.MaxVecNorm(k2) + fail := 0 + rlerr := float64(0.0) + if maxdm < MinSlope { // Only step using relerr if dmdt is big enough. Overcomes equilibrium problem + fail = 0 + } else { + cuda.Div(errnorm, errnorm, ddtnorm) //re-use errnorm + rlerr = float64(cuda.MaxAbs(errnorm)) + fail = 1 + } + if fail == 0 || RelErr <= 0.0 || rlerr < RelErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop + // step OK + setLastErr(err) + setMaxTorque(k2) + NSteps++ + Time = t0 + Dt_si + if fail == 0 { + adaptDt(math.Pow(MaxErr/err, 1./6.)) + } else { + adaptDt(math.Pow(RelErr/rlerr, 1./6.)) + } + data.Copy(rk.k1, k2) // FSAL + } else { + // undo bad step + //util.Println("Bad step at t=", t0, ", err=", err) + util.Assert(FixDt == 0) + Time = t0 + data.Copy(m, m0) + NUndone++ + adaptDt(math.Pow(RelErr/rlerr, 1./7.)) + } } else { // undo bad step //util.Println("Bad step at t=", t0, ", err=", err) @@ -118,4 +161,6 @@ func (rk *RK56) Step() { } func (rk *RK56) Free() { + rk.k1.Free() + rk.k1 = nil } diff --git a/engine/run.go b/engine/run.go index 6cff99b23..ebdf788a4 100644 --- a/engine/run.go +++ b/engine/run.go @@ -20,6 +20,8 @@ var ( Dt_si float64 = 1e-15 // time step = dt_si (seconds) *dt_mul, which should be nice float32 MinDt, MaxDt float64 // minimum and maximum time step MaxErr float64 = 1e-5 // maximum error/step + RelErr float64 = 1e-4 // maximum relative error/step + MinSlope float64 = 1e8 // minimum delta per unit time before using relative error to step Headroom float64 = 0.8 // solver headroom, (Gustafsson, 1992, Control of Error and Convergence in ODE Solvers) LastErr, PeakErr float64 // error of last step, highest error ever LastTorque float64 // maxTorque of last time step @@ -39,6 +41,7 @@ func init() { DeclVar("MinDt", &MinDt, "Minimum time step the solver can take (s)") DeclVar("MaxDt", &MaxDt, "Maximum time step the solver can take (s)") DeclVar("MaxErr", &MaxErr, "Maximum error per step the solver can tolerate (default = 1e-5)") + DeclVar("RelErr", &RelErr, "Maximum relative error per step the solver can tolerate (default = 1e-4)") DeclVar("Headroom", &Headroom, "Solver headroom (default = 0.8)") DeclVar("FixDt", &FixDt, "Set a fixed time step, 0 disables fixed step (which is the default)") DeclFunc("Exit", Exit, "Exit from the program")