diff --git a/cuda/copypadmul2_wrapper.go b/cuda/copypadmul2_wrapper.go index 4bf715ef2..16e96d596 100644 --- a/cuda/copypadmul2_wrapper.go +++ b/cuda/copypadmul2_wrapper.go @@ -5,54 +5,54 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for copypadmul2 kernel var copypadmul2_code cu.Function // Stores the arguments for copypadmul2 kernel invocation -type copypadmul2_args_t struct { - arg_dst unsafe.Pointer - arg_Dx int - arg_Dy int - arg_Dz int - arg_src unsafe.Pointer - arg_Sx int - arg_Sy int - arg_Sz int - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_vol unsafe.Pointer - argptr [11]unsafe.Pointer +type copypadmul2_args_t struct{ + arg_dst unsafe.Pointer + arg_Dx int + arg_Dy int + arg_Dz int + arg_src unsafe.Pointer + arg_Sx int + arg_Sy int + arg_Sz int + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_vol unsafe.Pointer + argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for copypadmul2 kernel invocation var copypadmul2_args copypadmul2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- copypadmul2_args.argptr[0] = unsafe.Pointer(&copypadmul2_args.arg_dst) - copypadmul2_args.argptr[1] = unsafe.Pointer(&copypadmul2_args.arg_Dx) - copypadmul2_args.argptr[2] = unsafe.Pointer(&copypadmul2_args.arg_Dy) - copypadmul2_args.argptr[3] = unsafe.Pointer(&copypadmul2_args.arg_Dz) - copypadmul2_args.argptr[4] = unsafe.Pointer(&copypadmul2_args.arg_src) - copypadmul2_args.argptr[5] = unsafe.Pointer(&copypadmul2_args.arg_Sx) - copypadmul2_args.argptr[6] = unsafe.Pointer(&copypadmul2_args.arg_Sy) - copypadmul2_args.argptr[7] = unsafe.Pointer(&copypadmul2_args.arg_Sz) - copypadmul2_args.argptr[8] = unsafe.Pointer(&copypadmul2_args.arg_Ms_) - copypadmul2_args.argptr[9] = unsafe.Pointer(&copypadmul2_args.arg_Ms_mul) - copypadmul2_args.argptr[10] = unsafe.Pointer(&copypadmul2_args.arg_vol) -} + copypadmul2_args.argptr[0] = unsafe.Pointer(&copypadmul2_args.arg_dst) + copypadmul2_args.argptr[1] = unsafe.Pointer(&copypadmul2_args.arg_Dx) + copypadmul2_args.argptr[2] = unsafe.Pointer(&copypadmul2_args.arg_Dy) + copypadmul2_args.argptr[3] = unsafe.Pointer(&copypadmul2_args.arg_Dz) + copypadmul2_args.argptr[4] = unsafe.Pointer(&copypadmul2_args.arg_src) + copypadmul2_args.argptr[5] = unsafe.Pointer(&copypadmul2_args.arg_Sx) + copypadmul2_args.argptr[6] = unsafe.Pointer(&copypadmul2_args.arg_Sy) + copypadmul2_args.argptr[7] = unsafe.Pointer(&copypadmul2_args.arg_Sz) + copypadmul2_args.argptr[8] = unsafe.Pointer(&copypadmul2_args.arg_Ms_) + copypadmul2_args.argptr[9] = unsafe.Pointer(&copypadmul2_args.arg_Ms_mul) + copypadmul2_args.argptr[10] = unsafe.Pointer(&copypadmul2_args.arg_vol) + } // Wrapper for copypadmul2 CUDA kernel, asynchronous. 
-func k_copypadmul2_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, Ms_ unsafe.Pointer, Ms_mul float32, vol unsafe.Pointer, cfg *config) { - if Synchronous { // debug +func k_copypadmul2_async ( dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, Ms_ unsafe.Pointer, Ms_mul float32, vol unsafe.Pointer, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("copypadmul2") } @@ -60,47 +60,48 @@ func k_copypadmul2_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe. copypadmul2_args.Lock() defer copypadmul2_args.Unlock() - if copypadmul2_code == 0 { + if copypadmul2_code == 0{ copypadmul2_code = fatbinLoad(copypadmul2_map, "copypadmul2") } - copypadmul2_args.arg_dst = dst - copypadmul2_args.arg_Dx = Dx - copypadmul2_args.arg_Dy = Dy - copypadmul2_args.arg_Dz = Dz - copypadmul2_args.arg_src = src - copypadmul2_args.arg_Sx = Sx - copypadmul2_args.arg_Sy = Sy - copypadmul2_args.arg_Sz = Sz - copypadmul2_args.arg_Ms_ = Ms_ - copypadmul2_args.arg_Ms_mul = Ms_mul - copypadmul2_args.arg_vol = vol + copypadmul2_args.arg_dst = dst + copypadmul2_args.arg_Dx = Dx + copypadmul2_args.arg_Dy = Dy + copypadmul2_args.arg_Dz = Dz + copypadmul2_args.arg_src = src + copypadmul2_args.arg_Sx = Sx + copypadmul2_args.arg_Sy = Sy + copypadmul2_args.arg_Sz = Sz + copypadmul2_args.arg_Ms_ = Ms_ + copypadmul2_args.arg_Ms_mul = Ms_mul + copypadmul2_args.arg_vol = vol + args := copypadmul2_args.argptr[:] cu.LaunchKernel(copypadmul2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("copypadmul2") } } // maps compute capability on PTX code for copypadmul2 kernel. 
-var copypadmul2_map = map[int]string{0: "", - 30: copypadmul2_ptx_30, - 35: copypadmul2_ptx_35, - 37: copypadmul2_ptx_37, - 50: copypadmul2_ptx_50, - 52: copypadmul2_ptx_52, - 53: copypadmul2_ptx_53, - 60: copypadmul2_ptx_60, - 61: copypadmul2_ptx_61, - 70: copypadmul2_ptx_70, - 75: copypadmul2_ptx_75} +var copypadmul2_map = map[int]string{ 0: "" , +30: copypadmul2_ptx_30 , +35: copypadmul2_ptx_35 , +37: copypadmul2_ptx_37 , +50: copypadmul2_ptx_50 , +52: copypadmul2_ptx_52 , +53: copypadmul2_ptx_53 , +60: copypadmul2_ptx_60 , +61: copypadmul2_ptx_61 , +70: copypadmul2_ptx_70 , +75: copypadmul2_ptx_75 } // copypadmul2 PTX code for various compute capabilities. -const ( - copypadmul2_ptx_30 = ` +const( + copypadmul2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -203,7 +204,7 @@ BB0_6: ` - copypadmul2_ptx_35 = ` + copypadmul2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -306,7 +307,7 @@ BB0_6: ` - copypadmul2_ptx_37 = ` + copypadmul2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -409,7 +410,7 @@ BB0_6: ` - copypadmul2_ptx_50 = ` + copypadmul2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -512,7 +513,7 @@ BB0_6: ` - copypadmul2_ptx_52 = ` + copypadmul2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -615,7 +616,7 @@ BB0_6: ` - copypadmul2_ptx_53 = ` + copypadmul2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -718,7 +719,7 @@ BB0_6: ` - copypadmul2_ptx_60 = ` + copypadmul2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -821,7 +822,7 @@ BB0_6: ` - copypadmul2_ptx_61 = ` + copypadmul2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -924,7 +925,7 @@ BB0_6: ` - copypadmul2_ptx_70 = ` + copypadmul2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1027,7 +1028,7 @@ BB0_6: ` - copypadmul2_ptx_75 = ` + copypadmul2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1130,4 +1131,4 @@ BB0_6: ` -) + ) diff --git a/cuda/copyunpad_wrapper.go b/cuda/copyunpad_wrapper.go 
index f6134fd20..a3597898a 100644 --- a/cuda/copyunpad_wrapper.go +++ b/cuda/copyunpad_wrapper.go @@ -5,48 +5,48 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for copyunpad kernel var copyunpad_code cu.Function // Stores the arguments for copyunpad kernel invocation -type copyunpad_args_t struct { - arg_dst unsafe.Pointer - arg_Dx int - arg_Dy int - arg_Dz int - arg_src unsafe.Pointer - arg_Sx int - arg_Sy int - arg_Sz int - argptr [8]unsafe.Pointer +type copyunpad_args_t struct{ + arg_dst unsafe.Pointer + arg_Dx int + arg_Dy int + arg_Dz int + arg_src unsafe.Pointer + arg_Sx int + arg_Sy int + arg_Sz int + argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for copyunpad kernel invocation var copyunpad_args copyunpad_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - copyunpad_args.argptr[0] = unsafe.Pointer(&copyunpad_args.arg_dst) - copyunpad_args.argptr[1] = unsafe.Pointer(&copyunpad_args.arg_Dx) - copyunpad_args.argptr[2] = unsafe.Pointer(&copyunpad_args.arg_Dy) - copyunpad_args.argptr[3] = unsafe.Pointer(&copyunpad_args.arg_Dz) - copyunpad_args.argptr[4] = unsafe.Pointer(&copyunpad_args.arg_src) - copyunpad_args.argptr[5] = unsafe.Pointer(&copyunpad_args.arg_Sx) - copyunpad_args.argptr[6] = unsafe.Pointer(&copyunpad_args.arg_Sy) - copyunpad_args.argptr[7] = unsafe.Pointer(&copyunpad_args.arg_Sz) -} + copyunpad_args.argptr[0] = unsafe.Pointer(&copyunpad_args.arg_dst) + copyunpad_args.argptr[1] = unsafe.Pointer(&copyunpad_args.arg_Dx) + copyunpad_args.argptr[2] = unsafe.Pointer(&copyunpad_args.arg_Dy) + copyunpad_args.argptr[3] = unsafe.Pointer(&copyunpad_args.arg_Dz) + copyunpad_args.argptr[4] = unsafe.Pointer(&copyunpad_args.arg_src) + copyunpad_args.argptr[5] = unsafe.Pointer(&copyunpad_args.arg_Sx) + copyunpad_args.argptr[6] = unsafe.Pointer(&copyunpad_args.arg_Sy) + copyunpad_args.argptr[7] = unsafe.Pointer(&copyunpad_args.arg_Sz) + } // 
Wrapper for copyunpad CUDA kernel, asynchronous. -func k_copyunpad_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, cfg *config) { - if Synchronous { // debug +func k_copyunpad_async ( dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("copyunpad") } @@ -54,44 +54,45 @@ func k_copyunpad_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Po copyunpad_args.Lock() defer copyunpad_args.Unlock() - if copyunpad_code == 0 { + if copyunpad_code == 0{ copyunpad_code = fatbinLoad(copyunpad_map, "copyunpad") } - copyunpad_args.arg_dst = dst - copyunpad_args.arg_Dx = Dx - copyunpad_args.arg_Dy = Dy - copyunpad_args.arg_Dz = Dz - copyunpad_args.arg_src = src - copyunpad_args.arg_Sx = Sx - copyunpad_args.arg_Sy = Sy - copyunpad_args.arg_Sz = Sz + copyunpad_args.arg_dst = dst + copyunpad_args.arg_Dx = Dx + copyunpad_args.arg_Dy = Dy + copyunpad_args.arg_Dz = Dz + copyunpad_args.arg_src = src + copyunpad_args.arg_Sx = Sx + copyunpad_args.arg_Sy = Sy + copyunpad_args.arg_Sz = Sz + args := copyunpad_args.argptr[:] cu.LaunchKernel(copyunpad_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("copyunpad") } } // maps compute capability on PTX code for copyunpad kernel. 
-var copyunpad_map = map[int]string{0: "", - 30: copyunpad_ptx_30, - 35: copyunpad_ptx_35, - 37: copyunpad_ptx_37, - 50: copyunpad_ptx_50, - 52: copyunpad_ptx_52, - 53: copyunpad_ptx_53, - 60: copyunpad_ptx_60, - 61: copyunpad_ptx_61, - 70: copyunpad_ptx_70, - 75: copyunpad_ptx_75} +var copyunpad_map = map[int]string{ 0: "" , +30: copyunpad_ptx_30 , +35: copyunpad_ptx_35 , +37: copyunpad_ptx_37 , +50: copyunpad_ptx_50 , +52: copyunpad_ptx_52 , +53: copyunpad_ptx_53 , +60: copyunpad_ptx_60 , +61: copyunpad_ptx_61 , +70: copyunpad_ptx_70 , +75: copyunpad_ptx_75 } // copyunpad PTX code for various compute capabilities. -const ( - copyunpad_ptx_30 = ` +const( + copyunpad_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -162,7 +163,7 @@ BB0_2: ` - copyunpad_ptx_35 = ` + copyunpad_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -233,7 +234,7 @@ BB0_2: ` - copyunpad_ptx_37 = ` + copyunpad_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -304,7 +305,7 @@ BB0_2: ` - copyunpad_ptx_50 = ` + copyunpad_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -375,7 +376,7 @@ BB0_2: ` - copyunpad_ptx_52 = ` + copyunpad_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -446,7 +447,7 @@ BB0_2: ` - copyunpad_ptx_53 = ` + copyunpad_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -517,7 +518,7 @@ BB0_2: ` - copyunpad_ptx_60 = ` + copyunpad_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -588,7 +589,7 @@ BB0_2: ` - copyunpad_ptx_61 = ` + copyunpad_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -659,7 +660,7 @@ BB0_2: ` - copyunpad_ptx_70 = ` + copyunpad_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -730,7 +731,7 @@ BB0_2: ` - copyunpad_ptx_75 = ` + copyunpad_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -801,4 +802,4 @@ BB0_2: ` -) + ) diff --git a/cuda/crop_wrapper.go b/cuda/crop_wrapper.go index efe1ccff1..c9bb409e5 100644 --- a/cuda/crop_wrapper.go +++ b/cuda/crop_wrapper.go @@ -5,54 +5,54 
@@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for crop kernel var crop_code cu.Function // Stores the arguments for crop kernel invocation -type crop_args_t struct { - arg_dst unsafe.Pointer - arg_Dx int - arg_Dy int - arg_Dz int - arg_src unsafe.Pointer - arg_Sx int - arg_Sy int - arg_Sz int - arg_Offx int - arg_Offy int - arg_Offz int - argptr [11]unsafe.Pointer +type crop_args_t struct{ + arg_dst unsafe.Pointer + arg_Dx int + arg_Dy int + arg_Dz int + arg_src unsafe.Pointer + arg_Sx int + arg_Sy int + arg_Sz int + arg_Offx int + arg_Offy int + arg_Offz int + argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for crop kernel invocation var crop_args crop_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - crop_args.argptr[0] = unsafe.Pointer(&crop_args.arg_dst) - crop_args.argptr[1] = unsafe.Pointer(&crop_args.arg_Dx) - crop_args.argptr[2] = unsafe.Pointer(&crop_args.arg_Dy) - crop_args.argptr[3] = unsafe.Pointer(&crop_args.arg_Dz) - crop_args.argptr[4] = unsafe.Pointer(&crop_args.arg_src) - crop_args.argptr[5] = unsafe.Pointer(&crop_args.arg_Sx) - crop_args.argptr[6] = unsafe.Pointer(&crop_args.arg_Sy) - crop_args.argptr[7] = unsafe.Pointer(&crop_args.arg_Sz) - crop_args.argptr[8] = unsafe.Pointer(&crop_args.arg_Offx) - crop_args.argptr[9] = unsafe.Pointer(&crop_args.arg_Offy) - crop_args.argptr[10] = unsafe.Pointer(&crop_args.arg_Offz) -} + crop_args.argptr[0] = unsafe.Pointer(&crop_args.arg_dst) + crop_args.argptr[1] = unsafe.Pointer(&crop_args.arg_Dx) + crop_args.argptr[2] = unsafe.Pointer(&crop_args.arg_Dy) + crop_args.argptr[3] = unsafe.Pointer(&crop_args.arg_Dz) + crop_args.argptr[4] = unsafe.Pointer(&crop_args.arg_src) + crop_args.argptr[5] = unsafe.Pointer(&crop_args.arg_Sx) + crop_args.argptr[6] = unsafe.Pointer(&crop_args.arg_Sy) + crop_args.argptr[7] = 
unsafe.Pointer(&crop_args.arg_Sz) + crop_args.argptr[8] = unsafe.Pointer(&crop_args.arg_Offx) + crop_args.argptr[9] = unsafe.Pointer(&crop_args.arg_Offy) + crop_args.argptr[10] = unsafe.Pointer(&crop_args.arg_Offz) + } // Wrapper for crop CUDA kernel, asynchronous. -func k_crop_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, Offx int, Offy int, Offz int, cfg *config) { - if Synchronous { // debug +func k_crop_async ( dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, Offx int, Offy int, Offz int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("crop") } @@ -60,47 +60,48 @@ func k_crop_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer crop_args.Lock() defer crop_args.Unlock() - if crop_code == 0 { + if crop_code == 0{ crop_code = fatbinLoad(crop_map, "crop") } - crop_args.arg_dst = dst - crop_args.arg_Dx = Dx - crop_args.arg_Dy = Dy - crop_args.arg_Dz = Dz - crop_args.arg_src = src - crop_args.arg_Sx = Sx - crop_args.arg_Sy = Sy - crop_args.arg_Sz = Sz - crop_args.arg_Offx = Offx - crop_args.arg_Offy = Offy - crop_args.arg_Offz = Offz + crop_args.arg_dst = dst + crop_args.arg_Dx = Dx + crop_args.arg_Dy = Dy + crop_args.arg_Dz = Dz + crop_args.arg_src = src + crop_args.arg_Sx = Sx + crop_args.arg_Sy = Sy + crop_args.arg_Sz = Sz + crop_args.arg_Offx = Offx + crop_args.arg_Offy = Offy + crop_args.arg_Offz = Offz + args := crop_args.argptr[:] cu.LaunchKernel(crop_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("crop") } } // maps compute capability on PTX code for crop kernel. 
-var crop_map = map[int]string{0: "", - 30: crop_ptx_30, - 35: crop_ptx_35, - 37: crop_ptx_37, - 50: crop_ptx_50, - 52: crop_ptx_52, - 53: crop_ptx_53, - 60: crop_ptx_60, - 61: crop_ptx_61, - 70: crop_ptx_70, - 75: crop_ptx_75} +var crop_map = map[int]string{ 0: "" , +30: crop_ptx_30 , +35: crop_ptx_35 , +37: crop_ptx_37 , +50: crop_ptx_50 , +52: crop_ptx_52 , +53: crop_ptx_53 , +60: crop_ptx_60 , +61: crop_ptx_61 , +70: crop_ptx_70 , +75: crop_ptx_75 } // crop PTX code for various compute capabilities. -const ( - crop_ptx_30 = ` +const( + crop_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -180,7 +181,7 @@ BB0_2: ` - crop_ptx_35 = ` + crop_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -260,7 +261,7 @@ BB0_2: ` - crop_ptx_37 = ` + crop_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -340,7 +341,7 @@ BB0_2: ` - crop_ptx_50 = ` + crop_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -420,7 +421,7 @@ BB0_2: ` - crop_ptx_52 = ` + crop_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -500,7 +501,7 @@ BB0_2: ` - crop_ptx_53 = ` + crop_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -580,7 +581,7 @@ BB0_2: ` - crop_ptx_60 = ` + crop_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -660,7 +661,7 @@ BB0_2: ` - crop_ptx_61 = ` + crop_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -740,7 +741,7 @@ BB0_2: ` - crop_ptx_70 = ` + crop_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -820,7 +821,7 @@ BB0_2: ` - crop_ptx_75 = ` + crop_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -900,4 +901,4 @@ BB0_2: ` -) + ) diff --git a/cuda/crossproduct_wrapper.go b/cuda/crossproduct_wrapper.go index 10c242a8a..910878b6f 100644 --- a/cuda/crossproduct_wrapper.go +++ b/cuda/crossproduct_wrapper.go @@ -5,52 +5,52 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for crossproduct kernel var crossproduct_code cu.Function // Stores the arguments for crossproduct kernel invocation -type crossproduct_args_t struct { - arg_dstx unsafe.Pointer - arg_dsty unsafe.Pointer - arg_dstz unsafe.Pointer - arg_ax unsafe.Pointer - arg_ay unsafe.Pointer - arg_az unsafe.Pointer - arg_bx unsafe.Pointer - arg_by unsafe.Pointer - arg_bz unsafe.Pointer - arg_N int - argptr [10]unsafe.Pointer +type crossproduct_args_t struct{ + arg_dstx unsafe.Pointer + arg_dsty unsafe.Pointer + arg_dstz unsafe.Pointer + arg_ax unsafe.Pointer + arg_ay unsafe.Pointer + arg_az unsafe.Pointer + arg_bx unsafe.Pointer + arg_by unsafe.Pointer + arg_bz unsafe.Pointer + arg_N int + argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for crossproduct kernel invocation var crossproduct_args crossproduct_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- crossproduct_args.argptr[0] = unsafe.Pointer(&crossproduct_args.arg_dstx) - crossproduct_args.argptr[1] = unsafe.Pointer(&crossproduct_args.arg_dsty) - crossproduct_args.argptr[2] = unsafe.Pointer(&crossproduct_args.arg_dstz) - crossproduct_args.argptr[3] = unsafe.Pointer(&crossproduct_args.arg_ax) - crossproduct_args.argptr[4] = unsafe.Pointer(&crossproduct_args.arg_ay) - crossproduct_args.argptr[5] = unsafe.Pointer(&crossproduct_args.arg_az) - crossproduct_args.argptr[6] = unsafe.Pointer(&crossproduct_args.arg_bx) - crossproduct_args.argptr[7] = unsafe.Pointer(&crossproduct_args.arg_by) - crossproduct_args.argptr[8] = unsafe.Pointer(&crossproduct_args.arg_bz) - crossproduct_args.argptr[9] = unsafe.Pointer(&crossproduct_args.arg_N) -} + crossproduct_args.argptr[0] = unsafe.Pointer(&crossproduct_args.arg_dstx) + crossproduct_args.argptr[1] = unsafe.Pointer(&crossproduct_args.arg_dsty) + crossproduct_args.argptr[2] = unsafe.Pointer(&crossproduct_args.arg_dstz) + crossproduct_args.argptr[3] = unsafe.Pointer(&crossproduct_args.arg_ax) + crossproduct_args.argptr[4] = unsafe.Pointer(&crossproduct_args.arg_ay) + crossproduct_args.argptr[5] = unsafe.Pointer(&crossproduct_args.arg_az) + crossproduct_args.argptr[6] = unsafe.Pointer(&crossproduct_args.arg_bx) + crossproduct_args.argptr[7] = unsafe.Pointer(&crossproduct_args.arg_by) + crossproduct_args.argptr[8] = unsafe.Pointer(&crossproduct_args.arg_bz) + crossproduct_args.argptr[9] = unsafe.Pointer(&crossproduct_args.arg_N) + } // Wrapper for crossproduct CUDA kernel, asynchronous. 
-func k_crossproduct_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_crossproduct_async ( dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("crossproduct") } @@ -58,46 +58,47 @@ func k_crossproduct_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe. crossproduct_args.Lock() defer crossproduct_args.Unlock() - if crossproduct_code == 0 { + if crossproduct_code == 0{ crossproduct_code = fatbinLoad(crossproduct_map, "crossproduct") } - crossproduct_args.arg_dstx = dstx - crossproduct_args.arg_dsty = dsty - crossproduct_args.arg_dstz = dstz - crossproduct_args.arg_ax = ax - crossproduct_args.arg_ay = ay - crossproduct_args.arg_az = az - crossproduct_args.arg_bx = bx - crossproduct_args.arg_by = by - crossproduct_args.arg_bz = bz - crossproduct_args.arg_N = N + crossproduct_args.arg_dstx = dstx + crossproduct_args.arg_dsty = dsty + crossproduct_args.arg_dstz = dstz + crossproduct_args.arg_ax = ax + crossproduct_args.arg_ay = ay + crossproduct_args.arg_az = az + crossproduct_args.arg_bx = bx + crossproduct_args.arg_by = by + crossproduct_args.arg_bz = bz + crossproduct_args.arg_N = N + args := crossproduct_args.argptr[:] cu.LaunchKernel(crossproduct_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("crossproduct") } } // maps compute capability on PTX code for crossproduct kernel. 
-var crossproduct_map = map[int]string{0: "", - 30: crossproduct_ptx_30, - 35: crossproduct_ptx_35, - 37: crossproduct_ptx_37, - 50: crossproduct_ptx_50, - 52: crossproduct_ptx_52, - 53: crossproduct_ptx_53, - 60: crossproduct_ptx_60, - 61: crossproduct_ptx_61, - 70: crossproduct_ptx_70, - 75: crossproduct_ptx_75} +var crossproduct_map = map[int]string{ 0: "" , +30: crossproduct_ptx_30 , +35: crossproduct_ptx_35 , +37: crossproduct_ptx_37 , +50: crossproduct_ptx_50 , +52: crossproduct_ptx_52 , +53: crossproduct_ptx_53 , +60: crossproduct_ptx_60 , +61: crossproduct_ptx_61 , +70: crossproduct_ptx_70 , +75: crossproduct_ptx_75 } // crossproduct PTX code for various compute capabilities. -const ( - crossproduct_ptx_30 = ` +const( + crossproduct_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -187,7 +188,7 @@ BB0_2: ` - crossproduct_ptx_35 = ` + crossproduct_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -277,7 +278,7 @@ BB0_2: ` - crossproduct_ptx_37 = ` + crossproduct_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -367,7 +368,7 @@ BB0_2: ` - crossproduct_ptx_50 = ` + crossproduct_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -457,7 +458,7 @@ BB0_2: ` - crossproduct_ptx_52 = ` + crossproduct_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -547,7 +548,7 @@ BB0_2: ` - crossproduct_ptx_53 = ` + crossproduct_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -637,7 +638,7 @@ BB0_2: ` - crossproduct_ptx_60 = ` + crossproduct_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -727,7 +728,7 @@ BB0_2: ` - crossproduct_ptx_61 = ` + crossproduct_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -817,7 +818,7 @@ BB0_2: ` - crossproduct_ptx_70 = ` + crossproduct_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -907,7 +908,7 @@ BB0_2: ` - crossproduct_ptx_75 = ` + crossproduct_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -997,4 +998,4 @@ BB0_2: ` -) + ) diff --git 
a/cuda/cubicanisotropy2_wrapper.go b/cuda/cubicanisotropy2_wrapper.go index e8fab314f..ab70142b8 100644 --- a/cuda/cubicanisotropy2_wrapper.go +++ b/cuda/cubicanisotropy2_wrapper.go @@ -5,86 +5,86 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for addcubicanisotropy2 kernel var addcubicanisotropy2_code cu.Function // Stores the arguments for addcubicanisotropy2 kernel invocation -type addcubicanisotropy2_args_t struct { - arg_Bx unsafe.Pointer - arg_By unsafe.Pointer - arg_Bz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_k1_ unsafe.Pointer - arg_k1_mul float32 - arg_k2_ unsafe.Pointer - arg_k2_mul float32 - arg_k3_ unsafe.Pointer - arg_k3_mul float32 - arg_c1x_ unsafe.Pointer - arg_c1x_mul float32 - arg_c1y_ unsafe.Pointer - arg_c1y_mul float32 - arg_c1z_ unsafe.Pointer - arg_c1z_mul float32 - arg_c2x_ unsafe.Pointer - arg_c2x_mul float32 - arg_c2y_ unsafe.Pointer - arg_c2y_mul float32 - arg_c2z_ unsafe.Pointer - arg_c2z_mul float32 - arg_N int - argptr [27]unsafe.Pointer +type addcubicanisotropy2_args_t struct{ + arg_Bx unsafe.Pointer + arg_By unsafe.Pointer + arg_Bz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_k1_ unsafe.Pointer + arg_k1_mul float32 + arg_k2_ unsafe.Pointer + arg_k2_mul float32 + arg_k3_ unsafe.Pointer + arg_k3_mul float32 + arg_c1x_ unsafe.Pointer + arg_c1x_mul float32 + arg_c1y_ unsafe.Pointer + arg_c1y_mul float32 + arg_c1z_ unsafe.Pointer + arg_c1z_mul float32 + arg_c2x_ unsafe.Pointer + arg_c2x_mul float32 + arg_c2y_ unsafe.Pointer + arg_c2y_mul float32 + arg_c2z_ unsafe.Pointer + arg_c2z_mul float32 + arg_N int + argptr [27]unsafe.Pointer sync.Mutex } // Stores the arguments for addcubicanisotropy2 kernel invocation var addcubicanisotropy2_args 
addcubicanisotropy2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - addcubicanisotropy2_args.argptr[0] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bx) - addcubicanisotropy2_args.argptr[1] = unsafe.Pointer(&addcubicanisotropy2_args.arg_By) - addcubicanisotropy2_args.argptr[2] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bz) - addcubicanisotropy2_args.argptr[3] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mx) - addcubicanisotropy2_args.argptr[4] = unsafe.Pointer(&addcubicanisotropy2_args.arg_my) - addcubicanisotropy2_args.argptr[5] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mz) - addcubicanisotropy2_args.argptr[6] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_) - addcubicanisotropy2_args.argptr[7] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_mul) - addcubicanisotropy2_args.argptr[8] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_) - addcubicanisotropy2_args.argptr[9] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_mul) - addcubicanisotropy2_args.argptr[10] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_) - addcubicanisotropy2_args.argptr[11] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_mul) - addcubicanisotropy2_args.argptr[12] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_) - addcubicanisotropy2_args.argptr[13] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_mul) - addcubicanisotropy2_args.argptr[14] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_) - addcubicanisotropy2_args.argptr[15] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_mul) - addcubicanisotropy2_args.argptr[16] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_) - addcubicanisotropy2_args.argptr[17] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_mul) - addcubicanisotropy2_args.argptr[18] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_) - addcubicanisotropy2_args.argptr[19] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_mul) - addcubicanisotropy2_args.argptr[20] = 
unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_) - addcubicanisotropy2_args.argptr[21] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_mul) - addcubicanisotropy2_args.argptr[22] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_) - addcubicanisotropy2_args.argptr[23] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_mul) - addcubicanisotropy2_args.argptr[24] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_) - addcubicanisotropy2_args.argptr[25] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_mul) - addcubicanisotropy2_args.argptr[26] = unsafe.Pointer(&addcubicanisotropy2_args.arg_N) -} + addcubicanisotropy2_args.argptr[0] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bx) + addcubicanisotropy2_args.argptr[1] = unsafe.Pointer(&addcubicanisotropy2_args.arg_By) + addcubicanisotropy2_args.argptr[2] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bz) + addcubicanisotropy2_args.argptr[3] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mx) + addcubicanisotropy2_args.argptr[4] = unsafe.Pointer(&addcubicanisotropy2_args.arg_my) + addcubicanisotropy2_args.argptr[5] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mz) + addcubicanisotropy2_args.argptr[6] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_) + addcubicanisotropy2_args.argptr[7] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_mul) + addcubicanisotropy2_args.argptr[8] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_) + addcubicanisotropy2_args.argptr[9] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_mul) + addcubicanisotropy2_args.argptr[10] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_) + addcubicanisotropy2_args.argptr[11] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_mul) + addcubicanisotropy2_args.argptr[12] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_) + addcubicanisotropy2_args.argptr[13] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_mul) + addcubicanisotropy2_args.argptr[14] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_) + addcubicanisotropy2_args.argptr[15] = 
unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_mul) + addcubicanisotropy2_args.argptr[16] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_) + addcubicanisotropy2_args.argptr[17] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_mul) + addcubicanisotropy2_args.argptr[18] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_) + addcubicanisotropy2_args.argptr[19] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_mul) + addcubicanisotropy2_args.argptr[20] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_) + addcubicanisotropy2_args.argptr[21] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_mul) + addcubicanisotropy2_args.argptr[22] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_) + addcubicanisotropy2_args.argptr[23] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_mul) + addcubicanisotropy2_args.argptr[24] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_) + addcubicanisotropy2_args.argptr[25] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_mul) + addcubicanisotropy2_args.argptr[26] = unsafe.Pointer(&addcubicanisotropy2_args.arg_N) + } // Wrapper for addcubicanisotropy2 CUDA kernel, asynchronous. 
-func k_addcubicanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, k1_ unsafe.Pointer, k1_mul float32, k2_ unsafe.Pointer, k2_mul float32, k3_ unsafe.Pointer, k3_mul float32, c1x_ unsafe.Pointer, c1x_mul float32, c1y_ unsafe.Pointer, c1y_mul float32, c1z_ unsafe.Pointer, c1z_mul float32, c2x_ unsafe.Pointer, c2x_mul float32, c2y_ unsafe.Pointer, c2y_mul float32, c2z_ unsafe.Pointer, c2z_mul float32, N int, cfg *config) { - if Synchronous { // debug +func k_addcubicanisotropy2_async ( Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, k1_ unsafe.Pointer, k1_mul float32, k2_ unsafe.Pointer, k2_mul float32, k3_ unsafe.Pointer, k3_mul float32, c1x_ unsafe.Pointer, c1x_mul float32, c1y_ unsafe.Pointer, c1y_mul float32, c1z_ unsafe.Pointer, c1z_mul float32, c2x_ unsafe.Pointer, c2x_mul float32, c2y_ unsafe.Pointer, c2y_mul float32, c2z_ unsafe.Pointer, c2z_mul float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("addcubicanisotropy2") } @@ -92,63 +92,64 @@ func k_addcubicanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe addcubicanisotropy2_args.Lock() defer addcubicanisotropy2_args.Unlock() - if addcubicanisotropy2_code == 0 { + if addcubicanisotropy2_code == 0{ addcubicanisotropy2_code = fatbinLoad(addcubicanisotropy2_map, "addcubicanisotropy2") } - addcubicanisotropy2_args.arg_Bx = Bx - addcubicanisotropy2_args.arg_By = By - addcubicanisotropy2_args.arg_Bz = Bz - addcubicanisotropy2_args.arg_mx = mx - addcubicanisotropy2_args.arg_my = my - addcubicanisotropy2_args.arg_mz = mz - addcubicanisotropy2_args.arg_Ms_ = Ms_ - addcubicanisotropy2_args.arg_Ms_mul = Ms_mul - addcubicanisotropy2_args.arg_k1_ = k1_ - addcubicanisotropy2_args.arg_k1_mul = k1_mul - addcubicanisotropy2_args.arg_k2_ = k2_ - 
addcubicanisotropy2_args.arg_k2_mul = k2_mul - addcubicanisotropy2_args.arg_k3_ = k3_ - addcubicanisotropy2_args.arg_k3_mul = k3_mul - addcubicanisotropy2_args.arg_c1x_ = c1x_ - addcubicanisotropy2_args.arg_c1x_mul = c1x_mul - addcubicanisotropy2_args.arg_c1y_ = c1y_ - addcubicanisotropy2_args.arg_c1y_mul = c1y_mul - addcubicanisotropy2_args.arg_c1z_ = c1z_ - addcubicanisotropy2_args.arg_c1z_mul = c1z_mul - addcubicanisotropy2_args.arg_c2x_ = c2x_ - addcubicanisotropy2_args.arg_c2x_mul = c2x_mul - addcubicanisotropy2_args.arg_c2y_ = c2y_ - addcubicanisotropy2_args.arg_c2y_mul = c2y_mul - addcubicanisotropy2_args.arg_c2z_ = c2z_ - addcubicanisotropy2_args.arg_c2z_mul = c2z_mul - addcubicanisotropy2_args.arg_N = N + addcubicanisotropy2_args.arg_Bx = Bx + addcubicanisotropy2_args.arg_By = By + addcubicanisotropy2_args.arg_Bz = Bz + addcubicanisotropy2_args.arg_mx = mx + addcubicanisotropy2_args.arg_my = my + addcubicanisotropy2_args.arg_mz = mz + addcubicanisotropy2_args.arg_Ms_ = Ms_ + addcubicanisotropy2_args.arg_Ms_mul = Ms_mul + addcubicanisotropy2_args.arg_k1_ = k1_ + addcubicanisotropy2_args.arg_k1_mul = k1_mul + addcubicanisotropy2_args.arg_k2_ = k2_ + addcubicanisotropy2_args.arg_k2_mul = k2_mul + addcubicanisotropy2_args.arg_k3_ = k3_ + addcubicanisotropy2_args.arg_k3_mul = k3_mul + addcubicanisotropy2_args.arg_c1x_ = c1x_ + addcubicanisotropy2_args.arg_c1x_mul = c1x_mul + addcubicanisotropy2_args.arg_c1y_ = c1y_ + addcubicanisotropy2_args.arg_c1y_mul = c1y_mul + addcubicanisotropy2_args.arg_c1z_ = c1z_ + addcubicanisotropy2_args.arg_c1z_mul = c1z_mul + addcubicanisotropy2_args.arg_c2x_ = c2x_ + addcubicanisotropy2_args.arg_c2x_mul = c2x_mul + addcubicanisotropy2_args.arg_c2y_ = c2y_ + addcubicanisotropy2_args.arg_c2y_mul = c2y_mul + addcubicanisotropy2_args.arg_c2z_ = c2z_ + addcubicanisotropy2_args.arg_c2z_mul = c2z_mul + addcubicanisotropy2_args.arg_N = N + args := addcubicanisotropy2_args.argptr[:] cu.LaunchKernel(addcubicanisotropy2_code, cfg.Grid.X, 
cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("addcubicanisotropy2") } } // maps compute capability on PTX code for addcubicanisotropy2 kernel. -var addcubicanisotropy2_map = map[int]string{0: "", - 30: addcubicanisotropy2_ptx_30, - 35: addcubicanisotropy2_ptx_35, - 37: addcubicanisotropy2_ptx_37, - 50: addcubicanisotropy2_ptx_50, - 52: addcubicanisotropy2_ptx_52, - 53: addcubicanisotropy2_ptx_53, - 60: addcubicanisotropy2_ptx_60, - 61: addcubicanisotropy2_ptx_61, - 70: addcubicanisotropy2_ptx_70, - 75: addcubicanisotropy2_ptx_75} +var addcubicanisotropy2_map = map[int]string{ 0: "" , +30: addcubicanisotropy2_ptx_30 , +35: addcubicanisotropy2_ptx_35 , +37: addcubicanisotropy2_ptx_37 , +50: addcubicanisotropy2_ptx_50 , +52: addcubicanisotropy2_ptx_52 , +53: addcubicanisotropy2_ptx_53 , +60: addcubicanisotropy2_ptx_60 , +61: addcubicanisotropy2_ptx_61 , +70: addcubicanisotropy2_ptx_70 , +75: addcubicanisotropy2_ptx_75 } // addcubicanisotropy2 PTX code for various compute capabilities. 
-const ( - addcubicanisotropy2_ptx_30 = ` +const( + addcubicanisotropy2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -496,7 +497,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_35 = ` + addcubicanisotropy2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -844,7 +845,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_37 = ` + addcubicanisotropy2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -1192,7 +1193,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_50 = ` + addcubicanisotropy2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -1540,7 +1541,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_52 = ` + addcubicanisotropy2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -1888,7 +1889,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_53 = ` + addcubicanisotropy2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -2236,7 +2237,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_60 = ` + addcubicanisotropy2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -2584,7 +2585,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_61 = ` + addcubicanisotropy2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -2932,7 +2933,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_70 = ` + addcubicanisotropy2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -3280,7 +3281,7 @@ BB0_28: ` - addcubicanisotropy2_ptx_75 = ` + addcubicanisotropy2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -3628,4 +3629,4 @@ BB0_28: ` -) + ) diff --git a/cuda/div_wrapper.go b/cuda/div_wrapper.go index 4b94cece0..9abfa7439 100644 --- a/cuda/div_wrapper.go +++ b/cuda/div_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for pointwise_div kernel var pointwise_div_code cu.Function // Stores the arguments for pointwise_div kernel invocation -type pointwise_div_args_t struct { - arg_dst unsafe.Pointer - arg_a unsafe.Pointer - arg_b unsafe.Pointer - arg_N int - argptr [4]unsafe.Pointer +type pointwise_div_args_t struct{ + arg_dst unsafe.Pointer + arg_a unsafe.Pointer + arg_b unsafe.Pointer + arg_N int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for pointwise_div kernel invocation var pointwise_div_args pointwise_div_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - pointwise_div_args.argptr[0] = unsafe.Pointer(&pointwise_div_args.arg_dst) - pointwise_div_args.argptr[1] = unsafe.Pointer(&pointwise_div_args.arg_a) - pointwise_div_args.argptr[2] = unsafe.Pointer(&pointwise_div_args.arg_b) - pointwise_div_args.argptr[3] = unsafe.Pointer(&pointwise_div_args.arg_N) -} + pointwise_div_args.argptr[0] = unsafe.Pointer(&pointwise_div_args.arg_dst) + pointwise_div_args.argptr[1] = unsafe.Pointer(&pointwise_div_args.arg_a) + pointwise_div_args.argptr[2] = unsafe.Pointer(&pointwise_div_args.arg_b) + pointwise_div_args.argptr[3] = unsafe.Pointer(&pointwise_div_args.arg_N) + } // Wrapper for pointwise_div CUDA kernel, asynchronous. 
-func k_pointwise_div_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_pointwise_div_async ( dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("pointwise_div") } @@ -46,40 +46,41 @@ func k_pointwise_div_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointe pointwise_div_args.Lock() defer pointwise_div_args.Unlock() - if pointwise_div_code == 0 { + if pointwise_div_code == 0{ pointwise_div_code = fatbinLoad(pointwise_div_map, "pointwise_div") } - pointwise_div_args.arg_dst = dst - pointwise_div_args.arg_a = a - pointwise_div_args.arg_b = b - pointwise_div_args.arg_N = N + pointwise_div_args.arg_dst = dst + pointwise_div_args.arg_a = a + pointwise_div_args.arg_b = b + pointwise_div_args.arg_N = N + args := pointwise_div_args.argptr[:] cu.LaunchKernel(pointwise_div_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("pointwise_div") } } // maps compute capability on PTX code for pointwise_div kernel. -var pointwise_div_map = map[int]string{0: "", - 30: pointwise_div_ptx_30, - 35: pointwise_div_ptx_35, - 37: pointwise_div_ptx_37, - 50: pointwise_div_ptx_50, - 52: pointwise_div_ptx_52, - 53: pointwise_div_ptx_53, - 60: pointwise_div_ptx_60, - 61: pointwise_div_ptx_61, - 70: pointwise_div_ptx_70, - 75: pointwise_div_ptx_75} +var pointwise_div_map = map[int]string{ 0: "" , +30: pointwise_div_ptx_30 , +35: pointwise_div_ptx_35 , +37: pointwise_div_ptx_37 , +50: pointwise_div_ptx_50 , +52: pointwise_div_ptx_52 , +53: pointwise_div_ptx_53 , +60: pointwise_div_ptx_60 , +61: pointwise_div_ptx_61 , +70: pointwise_div_ptx_70 , +75: pointwise_div_ptx_75 } // pointwise_div PTX code for various compute capabilities. 
-const ( - pointwise_div_ptx_30 = ` +const( + pointwise_div_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -141,7 +142,7 @@ BB0_4: ` - pointwise_div_ptx_35 = ` + pointwise_div_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -203,7 +204,7 @@ BB0_4: ` - pointwise_div_ptx_37 = ` + pointwise_div_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -265,7 +266,7 @@ BB0_4: ` - pointwise_div_ptx_50 = ` + pointwise_div_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -327,7 +328,7 @@ BB0_4: ` - pointwise_div_ptx_52 = ` + pointwise_div_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -389,7 +390,7 @@ BB0_4: ` - pointwise_div_ptx_53 = ` + pointwise_div_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -451,7 +452,7 @@ BB0_4: ` - pointwise_div_ptx_60 = ` + pointwise_div_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -513,7 +514,7 @@ BB0_4: ` - pointwise_div_ptx_61 = ` + pointwise_div_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -575,7 +576,7 @@ BB0_4: ` - pointwise_div_ptx_70 = ` + pointwise_div_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -637,7 +638,7 @@ BB0_4: ` - pointwise_div_ptx_75 = ` + pointwise_div_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -699,4 +700,4 @@ BB0_4: ` -) + ) diff --git a/cuda/dmi_wrapper.go b/cuda/dmi_wrapper.go index 8869d88f7..b30d25433 100644 --- a/cuda/dmi_wrapper.go +++ b/cuda/dmi_wrapper.go @@ -5,70 +5,70 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for adddmi kernel var adddmi_code cu.Function // Stores the arguments for adddmi kernel invocation -type adddmi_args_t struct { - arg_Hx unsafe.Pointer - arg_Hy unsafe.Pointer - arg_Hz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_aLUT2d unsafe.Pointer - arg_dLUT2d unsafe.Pointer - arg_regions unsafe.Pointer - arg_cx float32 - arg_cy float32 - arg_cz float32 - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - arg_OpenBC byte - argptr [19]unsafe.Pointer +type adddmi_args_t struct{ + arg_Hx unsafe.Pointer + arg_Hy unsafe.Pointer + arg_Hz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_aLUT2d unsafe.Pointer + arg_dLUT2d unsafe.Pointer + arg_regions unsafe.Pointer + arg_cx float32 + arg_cy float32 + arg_cz float32 + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + arg_OpenBC byte + argptr [19]unsafe.Pointer sync.Mutex } // Stores the arguments for adddmi kernel invocation var adddmi_args adddmi_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- adddmi_args.argptr[0] = unsafe.Pointer(&adddmi_args.arg_Hx) - adddmi_args.argptr[1] = unsafe.Pointer(&adddmi_args.arg_Hy) - adddmi_args.argptr[2] = unsafe.Pointer(&adddmi_args.arg_Hz) - adddmi_args.argptr[3] = unsafe.Pointer(&adddmi_args.arg_mx) - adddmi_args.argptr[4] = unsafe.Pointer(&adddmi_args.arg_my) - adddmi_args.argptr[5] = unsafe.Pointer(&adddmi_args.arg_mz) - adddmi_args.argptr[6] = unsafe.Pointer(&adddmi_args.arg_Ms_) - adddmi_args.argptr[7] = unsafe.Pointer(&adddmi_args.arg_Ms_mul) - adddmi_args.argptr[8] = unsafe.Pointer(&adddmi_args.arg_aLUT2d) - adddmi_args.argptr[9] = unsafe.Pointer(&adddmi_args.arg_dLUT2d) - adddmi_args.argptr[10] = unsafe.Pointer(&adddmi_args.arg_regions) - adddmi_args.argptr[11] = unsafe.Pointer(&adddmi_args.arg_cx) - adddmi_args.argptr[12] = unsafe.Pointer(&adddmi_args.arg_cy) - adddmi_args.argptr[13] = unsafe.Pointer(&adddmi_args.arg_cz) - adddmi_args.argptr[14] = unsafe.Pointer(&adddmi_args.arg_Nx) - adddmi_args.argptr[15] = unsafe.Pointer(&adddmi_args.arg_Ny) - adddmi_args.argptr[16] = unsafe.Pointer(&adddmi_args.arg_Nz) - adddmi_args.argptr[17] = unsafe.Pointer(&adddmi_args.arg_PBC) - adddmi_args.argptr[18] = unsafe.Pointer(&adddmi_args.arg_OpenBC) -} + adddmi_args.argptr[0] = unsafe.Pointer(&adddmi_args.arg_Hx) + adddmi_args.argptr[1] = unsafe.Pointer(&adddmi_args.arg_Hy) + adddmi_args.argptr[2] = unsafe.Pointer(&adddmi_args.arg_Hz) + adddmi_args.argptr[3] = unsafe.Pointer(&adddmi_args.arg_mx) + adddmi_args.argptr[4] = unsafe.Pointer(&adddmi_args.arg_my) + adddmi_args.argptr[5] = unsafe.Pointer(&adddmi_args.arg_mz) + adddmi_args.argptr[6] = unsafe.Pointer(&adddmi_args.arg_Ms_) + adddmi_args.argptr[7] = unsafe.Pointer(&adddmi_args.arg_Ms_mul) + adddmi_args.argptr[8] = unsafe.Pointer(&adddmi_args.arg_aLUT2d) + adddmi_args.argptr[9] = unsafe.Pointer(&adddmi_args.arg_dLUT2d) + adddmi_args.argptr[10] = unsafe.Pointer(&adddmi_args.arg_regions) + adddmi_args.argptr[11] = unsafe.Pointer(&adddmi_args.arg_cx) + 
adddmi_args.argptr[12] = unsafe.Pointer(&adddmi_args.arg_cy) + adddmi_args.argptr[13] = unsafe.Pointer(&adddmi_args.arg_cz) + adddmi_args.argptr[14] = unsafe.Pointer(&adddmi_args.arg_Nx) + adddmi_args.argptr[15] = unsafe.Pointer(&adddmi_args.arg_Ny) + adddmi_args.argptr[16] = unsafe.Pointer(&adddmi_args.arg_Nz) + adddmi_args.argptr[17] = unsafe.Pointer(&adddmi_args.arg_PBC) + adddmi_args.argptr[18] = unsafe.Pointer(&adddmi_args.arg_OpenBC) + } // Wrapper for adddmi CUDA kernel, asynchronous. -func k_adddmi_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, dLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) { - if Synchronous { // debug +func k_adddmi_async ( Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, dLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("adddmi") } @@ -76,54 +76,56 @@ func k_adddmi_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx adddmi_args.Lock() defer adddmi_args.Unlock() - if adddmi_code == 0 { + if adddmi_code == 0{ adddmi_code = fatbinLoad(adddmi_map, "adddmi") } - adddmi_args.arg_Hx = Hx - adddmi_args.arg_Hy = Hy - adddmi_args.arg_Hz = Hz - adddmi_args.arg_mx = mx - adddmi_args.arg_my = my - adddmi_args.arg_mz = mz - adddmi_args.arg_Ms_ = Ms_ - adddmi_args.arg_Ms_mul = Ms_mul - adddmi_args.arg_aLUT2d = aLUT2d - adddmi_args.arg_dLUT2d = dLUT2d - adddmi_args.arg_regions = regions - adddmi_args.arg_cx = cx - adddmi_args.arg_cy = cy - adddmi_args.arg_cz = cz - adddmi_args.arg_Nx = Nx - adddmi_args.arg_Ny = Ny - adddmi_args.arg_Nz = Nz - 
adddmi_args.arg_PBC = PBC - adddmi_args.arg_OpenBC = OpenBC + adddmi_args.arg_Hx = Hx + adddmi_args.arg_Hy = Hy + adddmi_args.arg_Hz = Hz + adddmi_args.arg_mx = mx + adddmi_args.arg_my = my + adddmi_args.arg_mz = mz + adddmi_args.arg_Ms_ = Ms_ + adddmi_args.arg_Ms_mul = Ms_mul + adddmi_args.arg_aLUT2d = aLUT2d + adddmi_args.arg_dLUT2d = dLUT2d + adddmi_args.arg_regions = regions + adddmi_args.arg_cx = cx + adddmi_args.arg_cy = cy + adddmi_args.arg_cz = cz + adddmi_args.arg_Nx = Nx + adddmi_args.arg_Ny = Ny + adddmi_args.arg_Nz = Nz + adddmi_args.arg_PBC = PBC + adddmi_args.arg_OpenBC = OpenBC + args := adddmi_args.argptr[:] cu.LaunchKernel(adddmi_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("adddmi") } } // maps compute capability on PTX code for adddmi kernel. -var adddmi_map = map[int]string{0: "", - 30: adddmi_ptx_30, - 35: adddmi_ptx_35, - 37: adddmi_ptx_37, - 50: adddmi_ptx_50, - 52: adddmi_ptx_52, - 53: adddmi_ptx_53, - 60: adddmi_ptx_60, - 61: adddmi_ptx_61, - 70: adddmi_ptx_70} +var adddmi_map = map[int]string{ 0: "" , +30: adddmi_ptx_30 , +35: adddmi_ptx_35 , +37: adddmi_ptx_37 , +50: adddmi_ptx_50 , +52: adddmi_ptx_52 , +53: adddmi_ptx_53 , +60: adddmi_ptx_60 , +61: adddmi_ptx_61 , +70: adddmi_ptx_70 , +75: adddmi_ptx_75 } // adddmi PTX code for various compute capabilities. 
-const ( - adddmi_ptx_30 = ` +const( + adddmi_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -745,7 +747,7 @@ BB0_59: ` - adddmi_ptx_35 = ` + adddmi_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -1351,7 +1353,7 @@ BB0_59: ` - adddmi_ptx_37 = ` + adddmi_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -1957,7 +1959,7 @@ BB0_59: ` - adddmi_ptx_50 = ` + adddmi_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -2563,7 +2565,7 @@ BB0_59: ` - adddmi_ptx_52 = ` + adddmi_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -3169,7 +3171,7 @@ BB0_59: ` - adddmi_ptx_53 = ` + adddmi_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -3775,7 +3777,7 @@ BB0_59: ` - adddmi_ptx_60 = ` + adddmi_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -4381,7 +4383,7 @@ BB0_59: ` - adddmi_ptx_61 = ` + adddmi_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -4987,7 +4989,7 @@ BB0_59: ` - adddmi_ptx_70 = ` + adddmi_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -5593,4 +5595,610 @@ BB0_59: ` + adddmi_ptx_75 = ` +.version 6.3 +.target sm_75 +.address_size 64 + + // .globl adddmi + +.visible .entry adddmi( + .param .u64 adddmi_param_0, + .param .u64 adddmi_param_1, + .param .u64 adddmi_param_2, + .param .u64 adddmi_param_3, + .param .u64 adddmi_param_4, + .param .u64 adddmi_param_5, + .param .u64 adddmi_param_6, + .param .f32 adddmi_param_7, + .param .u64 adddmi_param_8, + .param .u64 adddmi_param_9, + .param .u64 adddmi_param_10, + .param .f32 adddmi_param_11, + .param .f32 adddmi_param_12, + .param .f32 adddmi_param_13, + .param .u32 adddmi_param_14, + .param .u32 adddmi_param_15, + .param .u32 adddmi_param_16, + .param .u8 adddmi_param_17, + .param .u8 adddmi_param_18 ) +{ + .reg .pred %p<56>; + .reg .b16 %rs<37>; + .reg .f32 %f<263>; + .reg .b32 %r<128>; + .reg .b64 %rd<85>; + + + ld.param.u64 %rd7, [adddmi_param_0]; + ld.param.u64 %rd8, [adddmi_param_1]; + ld.param.u64 %rd9, [adddmi_param_2]; + ld.param.u64 
%rd11, [adddmi_param_3]; + ld.param.u64 %rd12, [adddmi_param_4]; + ld.param.u64 %rd13, [adddmi_param_5]; + ld.param.u64 %rd10, [adddmi_param_6]; + ld.param.f32 %f261, [adddmi_param_7]; + ld.param.u64 %rd14, [adddmi_param_8]; + ld.param.u64 %rd15, [adddmi_param_9]; + ld.param.u64 %rd16, [adddmi_param_10]; + ld.param.f32 %f99, [adddmi_param_11]; + ld.param.f32 %f100, [adddmi_param_12]; + ld.param.f32 %f101, [adddmi_param_13]; + ld.param.u32 %r36, [adddmi_param_14]; + ld.param.u32 %r37, [adddmi_param_15]; + ld.param.u32 %r38, [adddmi_param_16]; + ld.param.u8 %rs14, [adddmi_param_18]; + ld.param.u8 %rs13, [adddmi_param_17]; + cvta.to.global.u64 %rd1, %rd15; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd13; + cvta.to.global.u64 %rd5, %rd12; + cvta.to.global.u64 %rd6, %rd11; + mov.u32 %r39, %ntid.x; + mov.u32 %r40, %ctaid.x; + mov.u32 %r41, %tid.x; + mad.lo.s32 %r1, %r39, %r40, %r41; + mov.u32 %r42, %ntid.y; + mov.u32 %r43, %ctaid.y; + mov.u32 %r44, %tid.y; + mad.lo.s32 %r2, %r42, %r43, %r44; + mov.u32 %r45, %ntid.z; + mov.u32 %r46, %ctaid.z; + mov.u32 %r47, %tid.z; + mad.lo.s32 %r3, %r45, %r46, %r47; + setp.ge.s32 %p1, %r2, %r37; + setp.ge.s32 %p2, %r1, %r36; + or.pred %p3, %p1, %p2; + setp.ge.s32 %p4, %r3, %r38; + or.pred %p5, %p3, %p4; + @%p5 bra BB0_59; + + mul.lo.s32 %r4, %r3, %r37; + add.s32 %r48, %r4, %r2; + mul.lo.s32 %r5, %r48, %r36; + add.s32 %r6, %r5, %r1; + mul.wide.s32 %rd17, %r6, 4; + add.s64 %rd18, %rd6, %rd17; + cvt.s64.s32 %rd19, %r6; + add.s64 %rd20, %rd5, %rd17; + add.s64 %rd21, %rd4, %rd17; + add.s64 %rd22, %rd3, %rd19; + ld.global.nc.u8 %rs1, [%rd22]; + cvt.u32.u16 %r49, %rs1; + and.b32 %r7, %r49, 255; + ld.global.nc.f32 %f1, [%rd18]; + ld.global.nc.f32 %f2, [%rd20]; + mul.f32 %f102, %f2, %f2; + fma.rn.f32 %f103, %f1, %f1, %f102; + ld.global.nc.f32 %f3, [%rd21]; + fma.rn.f32 %f104, %f3, %f3, %f103; + setp.eq.f32 %p6, %f104, 0f00000000; + @%p6 bra BB0_59; + + and.b16 %rs2, %rs13, 1; + setp.eq.s16 
%p7, %rs2, 0; + add.s32 %r8, %r1, -1; + @%p7 bra BB0_4; + + rem.s32 %r50, %r8, %r36; + add.s32 %r51, %r50, %r36; + rem.s32 %r122, %r51, %r36; + bra.uni BB0_5; + +BB0_4: + mov.u32 %r52, 0; + max.s32 %r122, %r8, %r52; + +BB0_5: + add.s32 %r12, %r122, %r5; + setp.lt.s32 %p9, %r8, 0; + mov.f32 %f225, 0f00000000; + and.pred %p10, %p9, %p7; + mov.f32 %f226, %f225; + mov.f32 %f227, %f225; + @%p10 bra BB0_7; + + mul.wide.s32 %rd23, %r12, 4; + add.s64 %rd24, %rd6, %rd23; + ld.global.nc.f32 %f225, [%rd24]; + add.s64 %rd25, %rd5, %rd23; + ld.global.nc.f32 %f226, [%rd25]; + add.s64 %rd26, %rd4, %rd23; + ld.global.nc.f32 %f227, [%rd26]; + +BB0_7: + mul.f32 %f108, %f226, %f226; + fma.rn.f32 %f109, %f225, %f225, %f108; + fma.rn.f32 %f10, %f227, %f227, %f109; + setp.eq.f32 %p11, %f10, 0f00000000; + mov.u16 %rs33, %rs1; + @%p11 bra BB0_9; + + cvt.s64.s32 %rd27, %r12; + add.s64 %rd28, %rd3, %rd27; + ld.global.nc.u8 %rs33, [%rd28]; + +BB0_9: + setp.gt.u16 %p12, %rs33, %rs1; + cvt.u32.u16 %r53, %rs33; + and.b32 %r54, %r53, 255; + selp.b32 %r55, %r7, %r54, %p12; + selp.b32 %r56, %r54, %r7, %p12; + add.s32 %r57, %r56, 1; + mul.lo.s32 %r58, %r57, %r56; + shr.u32 %r59, %r58, 1; + add.s32 %r60, %r59, %r55; + mul.wide.s32 %rd29, %r60, 4; + add.s64 %rd30, %rd2, %rd29; + ld.global.nc.f32 %f11, [%rd30]; + add.s64 %rd31, %rd1, %rd29; + ld.global.nc.f32 %f12, [%rd31]; + setp.ne.s16 %p13, %rs14, 0; + mov.f32 %f237, 0f00000000; + and.pred %p15, %p11, %p13; + mov.f32 %f238, %f237; + mov.f32 %f239, %f237; + @%p15 bra BB0_13; + + setp.neu.f32 %p16, %f10, 0f00000000; + @%p16 bra BB0_12; + + mul.f32 %f113, %f12, 0f3F000000; + div.rn.f32 %f114, %f113, %f11; + mul.f32 %f115, %f114, %f99; + fma.rn.f32 %f225, %f3, %f115, %f1; + mul.f32 %f116, %f1, %f115; + sub.f32 %f227, %f3, %f116; + mov.f32 %f226, %f2; + +BB0_12: + mul.f32 %f117, %f99, %f99; + add.f32 %f118, %f11, %f11; + div.rn.f32 %f119, %f118, %f117; + sub.f32 %f120, %f225, %f1; + sub.f32 %f121, %f226, %f2; + sub.f32 %f122, %f227, %f3; + fma.rn.f32 
%f123, %f120, %f119, 0f00000000; + fma.rn.f32 %f238, %f121, %f119, 0f00000000; + fma.rn.f32 %f124, %f119, %f122, 0f00000000; + div.rn.f32 %f125, %f12, %f99; + mul.f32 %f126, %f227, %f125; + sub.f32 %f237, %f123, %f126; + fma.rn.f32 %f239, %f225, %f125, %f124; + +BB0_13: + add.s32 %r13, %r1, 1; + @%p7 bra BB0_15; + + rem.s32 %r61, %r13, %r36; + add.s32 %r62, %r61, %r36; + rem.s32 %r123, %r62, %r36; + bra.uni BB0_16; + +BB0_15: + add.s32 %r63, %r36, -1; + min.s32 %r123, %r13, %r63; + +BB0_16: + add.s32 %r17, %r123, %r5; + setp.ge.s32 %p18, %r13, %r36; + mov.f32 %f231, 0f00000000; + and.pred %p20, %p18, %p7; + mov.f32 %f232, %f231; + mov.f32 %f233, %f231; + @%p20 bra BB0_18; + + mul.wide.s32 %rd32, %r17, 4; + add.s64 %rd33, %rd6, %rd32; + ld.global.nc.f32 %f231, [%rd33]; + add.s64 %rd34, %rd5, %rd32; + ld.global.nc.f32 %f232, [%rd34]; + add.s64 %rd35, %rd4, %rd32; + ld.global.nc.f32 %f233, [%rd35]; + +BB0_18: + mul.f32 %f130, %f232, %f232; + fma.rn.f32 %f131, %f231, %f231, %f130; + fma.rn.f32 %f30, %f233, %f233, %f131; + setp.eq.f32 %p21, %f30, 0f00000000; + mov.u16 %rs34, %rs1; + @%p21 bra BB0_20; + + cvt.s64.s32 %rd36, %r17; + add.s64 %rd37, %rd3, %rd36; + ld.global.nc.u8 %rs34, [%rd37]; + +BB0_20: + setp.gt.u16 %p22, %rs34, %rs1; + cvt.u32.u16 %r64, %rs34; + and.b32 %r65, %r64, 255; + selp.b32 %r66, %r7, %r65, %p22; + selp.b32 %r67, %r65, %r7, %p22; + add.s32 %r68, %r67, 1; + mul.lo.s32 %r69, %r68, %r67; + shr.u32 %r70, %r69, 1; + add.s32 %r71, %r70, %r66; + mul.wide.s32 %rd38, %r71, 4; + add.s64 %rd39, %rd2, %rd38; + ld.global.nc.f32 %f31, [%rd39]; + add.s64 %rd40, %rd1, %rd38; + ld.global.nc.f32 %f32, [%rd40]; + and.pred %p25, %p21, %p13; + @%p25 bra BB0_24; + + setp.neu.f32 %p26, %f30, 0f00000000; + @%p26 bra BB0_23; + + mul.f32 %f132, %f32, 0f3F000000; + div.rn.f32 %f133, %f132, %f31; + mul.f32 %f134, %f133, %f99; + mul.f32 %f135, %f3, %f134; + sub.f32 %f231, %f1, %f135; + fma.rn.f32 %f233, %f1, %f134, %f3; + mov.f32 %f232, %f2; + +BB0_23: + mul.f32 %f136, 
%f99, %f99; + add.f32 %f137, %f31, %f31; + div.rn.f32 %f138, %f137, %f136; + sub.f32 %f139, %f231, %f1; + sub.f32 %f140, %f232, %f2; + sub.f32 %f141, %f233, %f3; + fma.rn.f32 %f142, %f139, %f138, %f237; + fma.rn.f32 %f238, %f140, %f138, %f238; + fma.rn.f32 %f143, %f138, %f141, %f239; + div.rn.f32 %f144, %f32, %f99; + fma.rn.f32 %f237, %f233, %f144, %f142; + mul.f32 %f145, %f231, %f144; + sub.f32 %f239, %f143, %f145; + +BB0_24: + and.b16 %rs7, %rs13, 2; + setp.eq.s16 %p27, %rs7, 0; + add.s32 %r18, %r2, -1; + @%p27 bra BB0_26; + + rem.s32 %r72, %r18, %r37; + add.s32 %r73, %r72, %r37; + rem.s32 %r124, %r73, %r37; + bra.uni BB0_27; + +BB0_26: + mov.u32 %r74, 0; + max.s32 %r124, %r18, %r74; + +BB0_27: + add.s32 %r75, %r124, %r4; + mad.lo.s32 %r22, %r75, %r36, %r1; + setp.lt.s32 %p29, %r18, 0; + mov.f32 %f240, 0f00000000; + and.pred %p30, %p29, %p27; + mov.f32 %f241, %f240; + mov.f32 %f242, %f240; + @%p30 bra BB0_29; + + mul.wide.s32 %rd41, %r22, 4; + add.s64 %rd42, %rd6, %rd41; + ld.global.nc.f32 %f240, [%rd42]; + add.s64 %rd43, %rd5, %rd41; + ld.global.nc.f32 %f241, [%rd43]; + add.s64 %rd44, %rd4, %rd41; + ld.global.nc.f32 %f242, [%rd44]; + +BB0_29: + mul.f32 %f149, %f241, %f241; + fma.rn.f32 %f150, %f240, %f240, %f149; + fma.rn.f32 %f50, %f242, %f242, %f150; + setp.eq.f32 %p31, %f50, 0f00000000; + mov.u16 %rs35, %rs1; + @%p31 bra BB0_31; + + cvt.s64.s32 %rd45, %r22; + add.s64 %rd46, %rd3, %rd45; + ld.global.nc.u8 %rs35, [%rd46]; + +BB0_31: + setp.gt.u16 %p32, %rs35, %rs1; + cvt.u32.u16 %r76, %rs35; + and.b32 %r77, %r76, 255; + selp.b32 %r78, %r7, %r77, %p32; + selp.b32 %r79, %r77, %r7, %p32; + add.s32 %r80, %r79, 1; + mul.lo.s32 %r81, %r80, %r79; + shr.u32 %r82, %r81, 1; + add.s32 %r83, %r82, %r78; + mul.wide.s32 %rd47, %r83, 4; + add.s64 %rd48, %rd2, %rd47; + ld.global.nc.f32 %f51, [%rd48]; + add.s64 %rd49, %rd1, %rd47; + ld.global.nc.f32 %f52, [%rd49]; + and.pred %p35, %p31, %p13; + @%p35 bra BB0_35; + + setp.neu.f32 %p36, %f50, 0f00000000; + @%p36 bra BB0_34; + + 
mul.f32 %f151, %f52, 0f3F000000; + div.rn.f32 %f152, %f151, %f51; + mul.f32 %f153, %f152, %f100; + fma.rn.f32 %f241, %f3, %f153, %f2; + mul.f32 %f154, %f2, %f153; + sub.f32 %f242, %f3, %f154; + mov.f32 %f240, %f1; + +BB0_34: + mul.f32 %f155, %f100, %f100; + add.f32 %f156, %f51, %f51; + div.rn.f32 %f157, %f156, %f155; + sub.f32 %f158, %f240, %f1; + sub.f32 %f159, %f241, %f2; + sub.f32 %f160, %f242, %f3; + fma.rn.f32 %f237, %f158, %f157, %f237; + fma.rn.f32 %f161, %f159, %f157, %f238; + fma.rn.f32 %f162, %f157, %f160, %f239; + div.rn.f32 %f163, %f52, %f100; + mul.f32 %f164, %f242, %f163; + sub.f32 %f238, %f161, %f164; + fma.rn.f32 %f239, %f241, %f163, %f162; + +BB0_35: + add.s32 %r23, %r2, 1; + @%p27 bra BB0_37; + + rem.s32 %r84, %r23, %r37; + add.s32 %r85, %r84, %r37; + rem.s32 %r125, %r85, %r37; + bra.uni BB0_38; + +BB0_37: + add.s32 %r86, %r37, -1; + min.s32 %r125, %r23, %r86; + +BB0_38: + add.s32 %r87, %r125, %r4; + mad.lo.s32 %r27, %r87, %r36, %r1; + setp.ge.s32 %p38, %r23, %r37; + mov.f32 %f249, 0f00000000; + and.pred %p40, %p38, %p27; + mov.f32 %f250, %f249; + mov.f32 %f251, %f249; + @%p40 bra BB0_40; + + mul.wide.s32 %rd50, %r27, 4; + add.s64 %rd51, %rd6, %rd50; + ld.global.nc.f32 %f249, [%rd51]; + add.s64 %rd52, %rd5, %rd50; + ld.global.nc.f32 %f250, [%rd52]; + add.s64 %rd53, %rd4, %rd50; + ld.global.nc.f32 %f251, [%rd53]; + +BB0_40: + mul.f32 %f168, %f250, %f250; + fma.rn.f32 %f169, %f249, %f249, %f168; + fma.rn.f32 %f70, %f251, %f251, %f169; + setp.eq.f32 %p41, %f70, 0f00000000; + mov.u16 %rs36, %rs1; + @%p41 bra BB0_42; + + cvt.s64.s32 %rd54, %r27; + add.s64 %rd55, %rd3, %rd54; + ld.global.nc.u8 %rs36, [%rd55]; + +BB0_42: + setp.gt.u16 %p42, %rs36, %rs1; + cvt.u32.u16 %r88, %rs36; + and.b32 %r89, %r88, 255; + selp.b32 %r90, %r7, %r89, %p42; + selp.b32 %r91, %r89, %r7, %p42; + add.s32 %r92, %r91, 1; + mul.lo.s32 %r93, %r92, %r91; + shr.u32 %r94, %r93, 1; + add.s32 %r95, %r94, %r90; + mul.wide.s32 %rd56, %r95, 4; + add.s64 %rd57, %rd2, %rd56; + 
ld.global.nc.f32 %f71, [%rd57]; + add.s64 %rd58, %rd1, %rd56; + ld.global.nc.f32 %f72, [%rd58]; + and.pred %p45, %p41, %p13; + @%p45 bra BB0_46; + + setp.neu.f32 %p46, %f70, 0f00000000; + @%p46 bra BB0_45; + + mul.f32 %f170, %f72, 0f3F000000; + div.rn.f32 %f171, %f170, %f71; + mul.f32 %f172, %f171, %f100; + mul.f32 %f173, %f3, %f172; + sub.f32 %f250, %f2, %f173; + fma.rn.f32 %f251, %f2, %f172, %f3; + mov.f32 %f249, %f1; + +BB0_45: + mul.f32 %f174, %f100, %f100; + add.f32 %f175, %f71, %f71; + div.rn.f32 %f176, %f175, %f174; + sub.f32 %f177, %f249, %f1; + sub.f32 %f178, %f250, %f2; + sub.f32 %f179, %f251, %f3; + fma.rn.f32 %f237, %f177, %f176, %f237; + fma.rn.f32 %f180, %f178, %f176, %f238; + fma.rn.f32 %f181, %f176, %f179, %f239; + div.rn.f32 %f182, %f72, %f100; + fma.rn.f32 %f238, %f251, %f182, %f180; + mul.f32 %f183, %f250, %f182; + sub.f32 %f239, %f181, %f183; + +BB0_46: + setp.eq.s32 %p47, %r38, 1; + @%p47 bra BB0_54; + + and.b16 %rs12, %rs13, 4; + setp.eq.s16 %p48, %rs12, 0; + add.s32 %r28, %r3, -1; + @%p48 bra BB0_49; + + rem.s32 %r96, %r28, %r38; + add.s32 %r97, %r96, %r38; + rem.s32 %r126, %r97, %r38; + bra.uni BB0_50; + +BB0_49: + mov.u32 %r98, 0; + max.s32 %r126, %r28, %r98; + +BB0_50: + mad.lo.s32 %r99, %r126, %r37, %r2; + mad.lo.s32 %r100, %r99, %r36, %r1; + cvt.s64.s32 %rd59, %r100; + mul.wide.s32 %rd60, %r100, 4; + add.s64 %rd61, %rd6, %rd60; + add.s64 %rd62, %rd5, %rd60; + add.s64 %rd63, %rd4, %rd60; + ld.global.nc.f32 %f184, [%rd61]; + ld.global.nc.f32 %f185, [%rd62]; + mul.f32 %f186, %f185, %f185; + fma.rn.f32 %f187, %f184, %f184, %f186; + ld.global.nc.f32 %f188, [%rd63]; + fma.rn.f32 %f189, %f188, %f188, %f187; + setp.eq.f32 %p49, %f189, 0f00000000; + selp.f32 %f190, %f1, %f184, %p49; + selp.f32 %f191, %f2, %f185, %p49; + selp.f32 %f192, %f3, %f188, %p49; + add.s64 %rd64, %rd3, %rd59; + ld.global.nc.u8 %rs26, [%rd64]; + setp.gt.u16 %p50, %rs26, %rs1; + cvt.u32.u16 %r101, %rs26; + and.b32 %r102, %r101, 255; + selp.b32 %r103, %r7, %r102, %p50; + 
selp.b32 %r104, %r102, %r7, %p50; + add.s32 %r105, %r104, 1; + mul.lo.s32 %r106, %r105, %r104; + shr.u32 %r107, %r106, 1; + add.s32 %r108, %r107, %r103; + mul.wide.s32 %rd65, %r108, 4; + add.s64 %rd66, %rd2, %rd65; + ld.global.nc.f32 %f193, [%rd66]; + add.f32 %f194, %f193, %f193; + mul.f32 %f84, %f101, %f101; + div.rn.f32 %f195, %f194, %f84; + sub.f32 %f196, %f190, %f1; + sub.f32 %f197, %f191, %f2; + sub.f32 %f198, %f192, %f3; + fma.rn.f32 %f85, %f195, %f196, %f237; + fma.rn.f32 %f86, %f195, %f197, %f238; + fma.rn.f32 %f87, %f195, %f198, %f239; + add.s32 %r32, %r3, 1; + @%p48 bra BB0_52; + + rem.s32 %r109, %r32, %r38; + add.s32 %r110, %r109, %r38; + rem.s32 %r127, %r110, %r38; + bra.uni BB0_53; + +BB0_52: + add.s32 %r111, %r38, -1; + min.s32 %r127, %r32, %r111; + +BB0_53: + mad.lo.s32 %r112, %r127, %r37, %r2; + mad.lo.s32 %r113, %r112, %r36, %r1; + cvt.s64.s32 %rd67, %r113; + mul.wide.s32 %rd68, %r113, 4; + add.s64 %rd69, %rd6, %rd68; + add.s64 %rd70, %rd5, %rd68; + add.s64 %rd71, %rd4, %rd68; + ld.global.nc.f32 %f199, [%rd69]; + ld.global.nc.f32 %f200, [%rd70]; + mul.f32 %f201, %f200, %f200; + fma.rn.f32 %f202, %f199, %f199, %f201; + ld.global.nc.f32 %f203, [%rd71]; + fma.rn.f32 %f204, %f203, %f203, %f202; + setp.eq.f32 %p52, %f204, 0f00000000; + selp.f32 %f205, %f3, %f203, %p52; + selp.f32 %f206, %f2, %f200, %p52; + selp.f32 %f207, %f1, %f199, %p52; + add.s64 %rd72, %rd3, %rd67; + ld.global.nc.u8 %rs30, [%rd72]; + setp.gt.u16 %p53, %rs30, %rs1; + cvt.u32.u16 %r114, %rs30; + and.b32 %r115, %r114, 255; + selp.b32 %r116, %r7, %r115, %p53; + selp.b32 %r117, %r115, %r7, %p53; + add.s32 %r118, %r117, 1; + mul.lo.s32 %r119, %r118, %r117; + shr.u32 %r120, %r119, 1; + add.s32 %r121, %r120, %r116; + mul.wide.s32 %rd73, %r121, 4; + add.s64 %rd74, %rd2, %rd73; + ld.global.nc.f32 %f208, [%rd74]; + add.f32 %f209, %f208, %f208; + div.rn.f32 %f210, %f209, %f84; + sub.f32 %f211, %f207, %f1; + sub.f32 %f212, %f206, %f2; + sub.f32 %f213, %f205, %f3; + fma.rn.f32 %f237, %f210, 
%f211, %f85; + fma.rn.f32 %f238, %f210, %f212, %f86; + fma.rn.f32 %f239, %f210, %f213, %f87; + +BB0_54: + setp.eq.s64 %p54, %rd10, 0; + @%p54 bra BB0_56; + + cvta.to.global.u64 %rd75, %rd10; + add.s64 %rd77, %rd75, %rd17; + ld.global.nc.f32 %f214, [%rd77]; + mul.f32 %f261, %f214, %f261; + +BB0_56: + setp.eq.f32 %p55, %f261, 0f00000000; + mov.f32 %f262, 0f00000000; + @%p55 bra BB0_58; + + rcp.rn.f32 %f262, %f261; + +BB0_58: + cvta.to.global.u64 %rd78, %rd9; + cvta.to.global.u64 %rd79, %rd8; + cvta.to.global.u64 %rd80, %rd7; + add.s64 %rd82, %rd80, %rd17; + ld.global.f32 %f216, [%rd82]; + fma.rn.f32 %f217, %f237, %f262, %f216; + st.global.f32 [%rd82], %f217; + add.s64 %rd83, %rd79, %rd17; + ld.global.f32 %f218, [%rd83]; + fma.rn.f32 %f219, %f238, %f262, %f218; + st.global.f32 [%rd83], %f219; + add.s64 %rd84, %rd78, %rd17; + ld.global.f32 %f220, [%rd84]; + fma.rn.f32 %f221, %f239, %f262, %f220; + st.global.f32 [%rd84], %f221; + +BB0_59: + ret; +} + + +` + ) diff --git a/cuda/dmibulk_wrapper.go b/cuda/dmibulk_wrapper.go index 24a90ae6b..bbc0de019 100644 --- a/cuda/dmibulk_wrapper.go +++ b/cuda/dmibulk_wrapper.go @@ -5,70 +5,70 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for adddmibulk kernel var adddmibulk_code cu.Function // Stores the arguments for adddmibulk kernel invocation -type adddmibulk_args_t struct { - arg_Hx unsafe.Pointer - arg_Hy unsafe.Pointer - arg_Hz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_aLUT2d unsafe.Pointer - arg_DLUT2d unsafe.Pointer - arg_regions unsafe.Pointer - arg_cx float32 - arg_cy float32 - arg_cz float32 - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - arg_OpenBC byte - argptr [19]unsafe.Pointer +type adddmibulk_args_t struct{ + arg_Hx unsafe.Pointer + arg_Hy unsafe.Pointer + arg_Hz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_aLUT2d unsafe.Pointer + arg_DLUT2d unsafe.Pointer + arg_regions unsafe.Pointer + arg_cx float32 + arg_cy float32 + arg_cz float32 + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + arg_OpenBC byte + argptr [19]unsafe.Pointer sync.Mutex } // Stores the arguments for adddmibulk kernel invocation var adddmibulk_args adddmibulk_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- adddmibulk_args.argptr[0] = unsafe.Pointer(&adddmibulk_args.arg_Hx) - adddmibulk_args.argptr[1] = unsafe.Pointer(&adddmibulk_args.arg_Hy) - adddmibulk_args.argptr[2] = unsafe.Pointer(&adddmibulk_args.arg_Hz) - adddmibulk_args.argptr[3] = unsafe.Pointer(&adddmibulk_args.arg_mx) - adddmibulk_args.argptr[4] = unsafe.Pointer(&adddmibulk_args.arg_my) - adddmibulk_args.argptr[5] = unsafe.Pointer(&adddmibulk_args.arg_mz) - adddmibulk_args.argptr[6] = unsafe.Pointer(&adddmibulk_args.arg_Ms_) - adddmibulk_args.argptr[7] = unsafe.Pointer(&adddmibulk_args.arg_Ms_mul) - adddmibulk_args.argptr[8] = unsafe.Pointer(&adddmibulk_args.arg_aLUT2d) - adddmibulk_args.argptr[9] = unsafe.Pointer(&adddmibulk_args.arg_DLUT2d) - adddmibulk_args.argptr[10] = unsafe.Pointer(&adddmibulk_args.arg_regions) - adddmibulk_args.argptr[11] = unsafe.Pointer(&adddmibulk_args.arg_cx) - adddmibulk_args.argptr[12] = unsafe.Pointer(&adddmibulk_args.arg_cy) - adddmibulk_args.argptr[13] = unsafe.Pointer(&adddmibulk_args.arg_cz) - adddmibulk_args.argptr[14] = unsafe.Pointer(&adddmibulk_args.arg_Nx) - adddmibulk_args.argptr[15] = unsafe.Pointer(&adddmibulk_args.arg_Ny) - adddmibulk_args.argptr[16] = unsafe.Pointer(&adddmibulk_args.arg_Nz) - adddmibulk_args.argptr[17] = unsafe.Pointer(&adddmibulk_args.arg_PBC) - adddmibulk_args.argptr[18] = unsafe.Pointer(&adddmibulk_args.arg_OpenBC) -} + adddmibulk_args.argptr[0] = unsafe.Pointer(&adddmibulk_args.arg_Hx) + adddmibulk_args.argptr[1] = unsafe.Pointer(&adddmibulk_args.arg_Hy) + adddmibulk_args.argptr[2] = unsafe.Pointer(&adddmibulk_args.arg_Hz) + adddmibulk_args.argptr[3] = unsafe.Pointer(&adddmibulk_args.arg_mx) + adddmibulk_args.argptr[4] = unsafe.Pointer(&adddmibulk_args.arg_my) + adddmibulk_args.argptr[5] = unsafe.Pointer(&adddmibulk_args.arg_mz) + adddmibulk_args.argptr[6] = unsafe.Pointer(&adddmibulk_args.arg_Ms_) + adddmibulk_args.argptr[7] = unsafe.Pointer(&adddmibulk_args.arg_Ms_mul) + adddmibulk_args.argptr[8] = 
unsafe.Pointer(&adddmibulk_args.arg_aLUT2d) + adddmibulk_args.argptr[9] = unsafe.Pointer(&adddmibulk_args.arg_DLUT2d) + adddmibulk_args.argptr[10] = unsafe.Pointer(&adddmibulk_args.arg_regions) + adddmibulk_args.argptr[11] = unsafe.Pointer(&adddmibulk_args.arg_cx) + adddmibulk_args.argptr[12] = unsafe.Pointer(&adddmibulk_args.arg_cy) + adddmibulk_args.argptr[13] = unsafe.Pointer(&adddmibulk_args.arg_cz) + adddmibulk_args.argptr[14] = unsafe.Pointer(&adddmibulk_args.arg_Nx) + adddmibulk_args.argptr[15] = unsafe.Pointer(&adddmibulk_args.arg_Ny) + adddmibulk_args.argptr[16] = unsafe.Pointer(&adddmibulk_args.arg_Nz) + adddmibulk_args.argptr[17] = unsafe.Pointer(&adddmibulk_args.arg_PBC) + adddmibulk_args.argptr[18] = unsafe.Pointer(&adddmibulk_args.arg_OpenBC) + } // Wrapper for adddmibulk CUDA kernel, asynchronous. -func k_adddmibulk_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, DLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) { - if Synchronous { // debug +func k_adddmibulk_async ( Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, DLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("adddmibulk") } @@ -76,54 +76,56 @@ func k_adddmibulk_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, adddmibulk_args.Lock() defer adddmibulk_args.Unlock() - if adddmibulk_code == 0 { + if adddmibulk_code == 0{ adddmibulk_code = fatbinLoad(adddmibulk_map, "adddmibulk") } - adddmibulk_args.arg_Hx = Hx - adddmibulk_args.arg_Hy = Hy - adddmibulk_args.arg_Hz = Hz - adddmibulk_args.arg_mx = mx - 
adddmibulk_args.arg_my = my - adddmibulk_args.arg_mz = mz - adddmibulk_args.arg_Ms_ = Ms_ - adddmibulk_args.arg_Ms_mul = Ms_mul - adddmibulk_args.arg_aLUT2d = aLUT2d - adddmibulk_args.arg_DLUT2d = DLUT2d - adddmibulk_args.arg_regions = regions - adddmibulk_args.arg_cx = cx - adddmibulk_args.arg_cy = cy - adddmibulk_args.arg_cz = cz - adddmibulk_args.arg_Nx = Nx - adddmibulk_args.arg_Ny = Ny - adddmibulk_args.arg_Nz = Nz - adddmibulk_args.arg_PBC = PBC - adddmibulk_args.arg_OpenBC = OpenBC + adddmibulk_args.arg_Hx = Hx + adddmibulk_args.arg_Hy = Hy + adddmibulk_args.arg_Hz = Hz + adddmibulk_args.arg_mx = mx + adddmibulk_args.arg_my = my + adddmibulk_args.arg_mz = mz + adddmibulk_args.arg_Ms_ = Ms_ + adddmibulk_args.arg_Ms_mul = Ms_mul + adddmibulk_args.arg_aLUT2d = aLUT2d + adddmibulk_args.arg_DLUT2d = DLUT2d + adddmibulk_args.arg_regions = regions + adddmibulk_args.arg_cx = cx + adddmibulk_args.arg_cy = cy + adddmibulk_args.arg_cz = cz + adddmibulk_args.arg_Nx = Nx + adddmibulk_args.arg_Ny = Ny + adddmibulk_args.arg_Nz = Nz + adddmibulk_args.arg_PBC = PBC + adddmibulk_args.arg_OpenBC = OpenBC + args := adddmibulk_args.argptr[:] cu.LaunchKernel(adddmibulk_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("adddmibulk") } } // maps compute capability on PTX code for adddmibulk kernel. 
-var adddmibulk_map = map[int]string{0: "", - 30: adddmibulk_ptx_30, - 35: adddmibulk_ptx_35, - 37: adddmibulk_ptx_37, - 50: adddmibulk_ptx_50, - 52: adddmibulk_ptx_52, - 53: adddmibulk_ptx_53, - 60: adddmibulk_ptx_60, - 61: adddmibulk_ptx_61, - 70: adddmibulk_ptx_70} +var adddmibulk_map = map[int]string{ 0: "" , +30: adddmibulk_ptx_30 , +35: adddmibulk_ptx_35 , +37: adddmibulk_ptx_37 , +50: adddmibulk_ptx_50 , +52: adddmibulk_ptx_52 , +53: adddmibulk_ptx_53 , +60: adddmibulk_ptx_60 , +61: adddmibulk_ptx_61 , +70: adddmibulk_ptx_70 , +75: adddmibulk_ptx_75 } // adddmibulk PTX code for various compute capabilities. -const ( - adddmibulk_ptx_30 = ` +const( + adddmibulk_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -785,7 +787,7 @@ BB0_62: ` - adddmibulk_ptx_35 = ` + adddmibulk_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -1434,7 +1436,7 @@ BB0_62: ` - adddmibulk_ptx_37 = ` + adddmibulk_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -2083,7 +2085,7 @@ BB0_62: ` - adddmibulk_ptx_50 = ` + adddmibulk_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -2732,7 +2734,7 @@ BB0_62: ` - adddmibulk_ptx_52 = ` + adddmibulk_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -3381,7 +3383,7 @@ BB0_62: ` - adddmibulk_ptx_53 = ` + adddmibulk_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -4030,7 +4032,7 @@ BB0_62: ` - adddmibulk_ptx_60 = ` + adddmibulk_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -4679,7 +4681,7 @@ BB0_62: ` - adddmibulk_ptx_61 = ` + adddmibulk_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -5328,7 +5330,7 @@ BB0_62: ` - adddmibulk_ptx_70 = ` + adddmibulk_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -5977,4 +5979,653 @@ BB0_62: ` + adddmibulk_ptx_75 = ` +.version 6.3 +.target sm_75 +.address_size 64 + + // .globl adddmibulk + +.visible .entry adddmibulk( + .param .u64 adddmibulk_param_0, + .param .u64 adddmibulk_param_1, + .param .u64 adddmibulk_param_2, + .param .u64 
adddmibulk_param_3, + .param .u64 adddmibulk_param_4, + .param .u64 adddmibulk_param_5, + .param .u64 adddmibulk_param_6, + .param .f32 adddmibulk_param_7, + .param .u64 adddmibulk_param_8, + .param .u64 adddmibulk_param_9, + .param .u64 adddmibulk_param_10, + .param .f32 adddmibulk_param_11, + .param .f32 adddmibulk_param_12, + .param .f32 adddmibulk_param_13, + .param .u32 adddmibulk_param_14, + .param .u32 adddmibulk_param_15, + .param .u32 adddmibulk_param_16, + .param .u8 adddmibulk_param_17, + .param .u8 adddmibulk_param_18 ) +{ + .reg .pred %p<70>; + .reg .b16 %rs<43>; + .reg .f32 %f<292>; + .reg .b32 %r<128>; + .reg .b64 %rd<87>; + + + ld.param.u64 %rd9, [adddmibulk_param_0]; + ld.param.u64 %rd10, [adddmibulk_param_1]; + ld.param.u64 %rd11, [adddmibulk_param_2]; + ld.param.u64 %rd13, [adddmibulk_param_3]; + ld.param.u64 %rd14, [adddmibulk_param_4]; + ld.param.u64 %rd15, [adddmibulk_param_5]; + ld.param.u64 %rd12, [adddmibulk_param_6]; + ld.param.f32 %f290, [adddmibulk_param_7]; + ld.param.u64 %rd16, [adddmibulk_param_8]; + ld.param.u64 %rd17, [adddmibulk_param_9]; + ld.param.u64 %rd18, [adddmibulk_param_10]; + ld.param.f32 %f87, [adddmibulk_param_11]; + ld.param.f32 %f88, [adddmibulk_param_12]; + ld.param.f32 %f89, [adddmibulk_param_13]; + ld.param.u32 %r43, [adddmibulk_param_14]; + ld.param.u32 %r44, [adddmibulk_param_15]; + ld.param.u32 %r45, [adddmibulk_param_16]; + ld.param.u8 %rs18, [adddmibulk_param_18]; + ld.param.u8 %rs17, [adddmibulk_param_17]; + cvta.to.global.u64 %rd1, %rd17; + cvta.to.global.u64 %rd2, %rd16; + cvta.to.global.u64 %rd3, %rd18; + cvta.to.global.u64 %rd4, %rd15; + cvta.to.global.u64 %rd5, %rd14; + cvta.to.global.u64 %rd6, %rd13; + mov.u32 %r46, %ntid.x; + mov.u32 %r47, %ctaid.x; + mov.u32 %r48, %tid.x; + mad.lo.s32 %r1, %r46, %r47, %r48; + mov.u32 %r49, %ntid.y; + mov.u32 %r50, %ctaid.y; + mov.u32 %r51, %tid.y; + mad.lo.s32 %r2, %r49, %r50, %r51; + mov.u32 %r52, %ntid.z; + mov.u32 %r53, %ctaid.z; + mov.u32 %r54, %tid.z; + mad.lo.s32 
%r3, %r52, %r53, %r54; + setp.ge.s32 %p1, %r2, %r44; + setp.ge.s32 %p2, %r1, %r43; + or.pred %p3, %p1, %p2; + setp.ge.s32 %p4, %r3, %r45; + or.pred %p5, %p3, %p4; + @%p5 bra BB0_62; + + mul.lo.s32 %r4, %r3, %r44; + add.s32 %r55, %r4, %r2; + mul.lo.s32 %r5, %r55, %r43; + add.s32 %r6, %r5, %r1; + mul.wide.s32 %rd19, %r6, 4; + add.s64 %rd20, %rd6, %rd19; + cvt.s64.s32 %rd21, %r6; + add.s64 %rd22, %rd5, %rd19; + add.s64 %rd23, %rd4, %rd19; + add.s64 %rd24, %rd3, %rd21; + ld.global.nc.u8 %rs1, [%rd24]; + cvt.u32.u16 %r56, %rs1; + and.b32 %r7, %r56, 255; + ld.global.nc.f32 %f1, [%rd20]; + ld.global.nc.f32 %f2, [%rd22]; + mul.f32 %f90, %f2, %f2; + fma.rn.f32 %f91, %f1, %f1, %f90; + ld.global.nc.f32 %f3, [%rd23]; + fma.rn.f32 %f92, %f3, %f3, %f91; + setp.eq.f32 %p6, %f92, 0f00000000; + @%p6 bra BB0_62; + + and.b16 %rs2, %rs17, 1; + setp.eq.s16 %p7, %rs2, 0; + add.s32 %r8, %r1, -1; + @%p7 bra BB0_4; + + rem.s32 %r57, %r8, %r43; + add.s32 %r58, %r57, %r43; + rem.s32 %r122, %r58, %r43; + bra.uni BB0_5; + +BB0_4: + mov.u32 %r59, 0; + max.s32 %r122, %r8, %r59; + +BB0_5: + add.s32 %r12, %r122, %r5; + setp.lt.s32 %p9, %r8, 0; + mov.f32 %f254, 0f00000000; + and.pred %p10, %p9, %p7; + mov.f32 %f255, %f254; + mov.f32 %f256, %f254; + @%p10 bra BB0_7; + + mul.wide.s32 %rd25, %r12, 4; + add.s64 %rd26, %rd6, %rd25; + ld.global.nc.f32 %f254, [%rd26]; + add.s64 %rd27, %rd5, %rd25; + ld.global.nc.f32 %f255, [%rd27]; + add.s64 %rd28, %rd4, %rd25; + ld.global.nc.f32 %f256, [%rd28]; + +BB0_7: + mul.f32 %f96, %f255, %f255; + fma.rn.f32 %f97, %f254, %f254, %f96; + fma.rn.f32 %f10, %f256, %f256, %f97; + setp.eq.f32 %p11, %f10, 0f00000000; + mov.u16 %rs37, %rs1; + @%p11 bra BB0_9; + + cvt.s64.s32 %rd29, %r12; + add.s64 %rd30, %rd3, %rd29; + ld.global.nc.u8 %rs37, [%rd30]; + +BB0_9: + setp.gt.u16 %p12, %rs37, %rs1; + cvt.u32.u16 %r60, %rs37; + and.b32 %r61, %r60, 255; + selp.b32 %r62, %r7, %r61, %p12; + selp.b32 %r63, %r61, %r7, %p12; + add.s32 %r64, %r63, 1; + mul.lo.s32 %r65, %r64, %r63; + 
shr.u32 %r66, %r65, 1; + add.s32 %r13, %r66, %r62; + setp.ne.s16 %p13, %rs18, 0; + mov.f32 %f263, 0f00000000; + and.pred %p15, %p11, %p13; + mov.f32 %f264, %f263; + mov.f32 %f265, %f263; + @%p15 bra BB0_11; + + mul.wide.s32 %rd31, %r13, 4; + add.s64 %rd32, %rd2, %rd31; + ld.global.nc.f32 %f101, [%rd32]; + add.f32 %f102, %f101, %f101; + add.s64 %rd33, %rd1, %rd31; + ld.global.nc.f32 %f103, [%rd33]; + div.rn.f32 %f104, %f103, %f102; + mul.f32 %f105, %f104, %f87; + fma.rn.f32 %f106, %f3, %f105, %f2; + mul.f32 %f107, %f2, %f105; + sub.f32 %f108, %f3, %f107; + selp.f32 %f109, %f1, %f254, %p11; + selp.f32 %f110, %f106, %f255, %p11; + selp.f32 %f111, %f108, %f256, %p11; + mul.f32 %f112, %f87, %f87; + div.rn.f32 %f113, %f102, %f112; + sub.f32 %f114, %f109, %f1; + sub.f32 %f115, %f110, %f2; + sub.f32 %f116, %f111, %f3; + fma.rn.f32 %f263, %f114, %f113, 0f00000000; + fma.rn.f32 %f117, %f115, %f113, 0f00000000; + fma.rn.f32 %f118, %f116, %f113, 0f00000000; + div.rn.f32 %f119, %f103, %f87; + mul.f32 %f120, %f111, %f119; + sub.f32 %f264, %f117, %f120; + fma.rn.f32 %f265, %f110, %f119, %f118; + +BB0_11: + add.s32 %r14, %r1, 1; + @%p7 bra BB0_13; + + rem.s32 %r67, %r14, %r43; + add.s32 %r68, %r67, %r43; + rem.s32 %r123, %r68, %r43; + bra.uni BB0_14; + +BB0_13: + add.s32 %r69, %r43, -1; + min.s32 %r123, %r14, %r69; + +BB0_14: + add.s32 %r18, %r123, %r5; + setp.ge.s32 %p18, %r14, %r43; + mov.f32 %f260, 0f00000000; + and.pred %p20, %p18, %p7; + mov.f32 %f261, %f260; + mov.f32 %f262, %f260; + @%p20 bra BB0_16; + + mul.wide.s32 %rd34, %r18, 4; + add.s64 %rd35, %rd6, %rd34; + ld.global.nc.f32 %f260, [%rd35]; + add.s64 %rd36, %rd5, %rd34; + ld.global.nc.f32 %f261, [%rd36]; + add.s64 %rd37, %rd4, %rd34; + ld.global.nc.f32 %f262, [%rd37]; + +BB0_16: + mul.f32 %f124, %f261, %f261; + fma.rn.f32 %f125, %f260, %f260, %f124; + fma.rn.f32 %f23, %f262, %f262, %f125; + setp.eq.f32 %p21, %f23, 0f00000000; + mov.u16 %rs38, %rs1; + @%p21 bra BB0_18; + + cvt.s64.s32 %rd38, %r18; + add.s64 %rd39, 
%rd3, %rd38; + ld.global.nc.u8 %rs38, [%rd39]; + +BB0_18: + setp.gt.u16 %p22, %rs38, %rs1; + cvt.u32.u16 %r70, %rs38; + and.b32 %r71, %r70, 255; + selp.b32 %r72, %r7, %r71, %p22; + selp.b32 %r73, %r71, %r7, %p22; + add.s32 %r74, %r73, 1; + mul.lo.s32 %r75, %r74, %r73; + shr.u32 %r76, %r75, 1; + add.s32 %r19, %r76, %r72; + and.pred %p25, %p21, %p13; + @%p25 bra BB0_20; + + mul.wide.s32 %rd40, %r19, 4; + add.s64 %rd41, %rd2, %rd40; + ld.global.nc.f32 %f126, [%rd41]; + add.f32 %f127, %f126, %f126; + add.s64 %rd42, %rd1, %rd40; + ld.global.nc.f32 %f128, [%rd42]; + div.rn.f32 %f129, %f128, %f127; + mul.f32 %f130, %f129, %f87; + mul.f32 %f131, %f3, %f130; + sub.f32 %f132, %f2, %f131; + fma.rn.f32 %f133, %f2, %f130, %f3; + selp.f32 %f134, %f1, %f260, %p21; + selp.f32 %f135, %f132, %f261, %p21; + selp.f32 %f136, %f133, %f262, %p21; + mul.f32 %f137, %f87, %f87; + div.rn.f32 %f138, %f127, %f137; + sub.f32 %f139, %f134, %f1; + sub.f32 %f140, %f135, %f2; + sub.f32 %f141, %f136, %f3; + fma.rn.f32 %f263, %f139, %f138, %f263; + fma.rn.f32 %f142, %f140, %f138, %f264; + fma.rn.f32 %f143, %f141, %f138, %f265; + div.rn.f32 %f144, %f128, %f87; + fma.rn.f32 %f264, %f136, %f144, %f142; + mul.f32 %f145, %f135, %f144; + sub.f32 %f265, %f143, %f145; + +BB0_20: + and.b16 %rs7, %rs17, 2; + setp.eq.s16 %p27, %rs7, 0; + add.s32 %r20, %r2, -1; + @%p27 bra BB0_22; + + rem.s32 %r77, %r20, %r44; + add.s32 %r78, %r77, %r44; + rem.s32 %r124, %r78, %r44; + bra.uni BB0_23; + +BB0_22: + mov.u32 %r79, 0; + max.s32 %r124, %r20, %r79; + +BB0_23: + add.s32 %r80, %r124, %r4; + mad.lo.s32 %r24, %r80, %r43, %r1; + setp.lt.s32 %p29, %r20, 0; + mov.f32 %f266, 0f00000000; + and.pred %p30, %p29, %p27; + mov.f32 %f267, %f266; + mov.f32 %f268, %f266; + @%p30 bra BB0_25; + + mul.wide.s32 %rd43, %r24, 4; + add.s64 %rd44, %rd6, %rd43; + ld.global.nc.f32 %f266, [%rd44]; + add.s64 %rd45, %rd5, %rd43; + ld.global.nc.f32 %f267, [%rd45]; + add.s64 %rd46, %rd4, %rd43; + ld.global.nc.f32 %f268, [%rd46]; + +BB0_25: + mul.f32 
%f149, %f267, %f267; + fma.rn.f32 %f150, %f266, %f266, %f149; + fma.rn.f32 %f36, %f268, %f268, %f150; + setp.eq.f32 %p31, %f36, 0f00000000; + mov.u16 %rs39, %rs1; + @%p31 bra BB0_27; + + cvt.s64.s32 %rd47, %r24; + add.s64 %rd48, %rd3, %rd47; + ld.global.nc.u8 %rs39, [%rd48]; + +BB0_27: + setp.gt.u16 %p32, %rs39, %rs1; + cvt.u32.u16 %r81, %rs39; + and.b32 %r82, %r81, 255; + selp.b32 %r83, %r7, %r82, %p32; + selp.b32 %r84, %r82, %r7, %p32; + add.s32 %r85, %r84, 1; + mul.lo.s32 %r86, %r85, %r84; + shr.u32 %r87, %r86, 1; + add.s32 %r25, %r87, %r83; + and.pred %p35, %p31, %p13; + @%p35 bra BB0_29; + + mul.wide.s32 %rd49, %r25, 4; + add.s64 %rd50, %rd2, %rd49; + ld.global.nc.f32 %f151, [%rd50]; + add.f32 %f152, %f151, %f151; + add.s64 %rd51, %rd1, %rd49; + ld.global.nc.f32 %f153, [%rd51]; + div.rn.f32 %f154, %f153, %f152; + mul.f32 %f155, %f154, %f88; + mul.f32 %f156, %f3, %f155; + sub.f32 %f157, %f1, %f156; + fma.rn.f32 %f158, %f1, %f155, %f3; + selp.f32 %f159, %f157, %f266, %p31; + selp.f32 %f160, %f2, %f267, %p31; + selp.f32 %f161, %f158, %f268, %p31; + mul.f32 %f162, %f88, %f88; + div.rn.f32 %f163, %f152, %f162; + sub.f32 %f164, %f159, %f1; + sub.f32 %f165, %f160, %f2; + sub.f32 %f166, %f161, %f3; + fma.rn.f32 %f167, %f164, %f163, %f263; + fma.rn.f32 %f264, %f165, %f163, %f264; + fma.rn.f32 %f168, %f166, %f163, %f265; + div.rn.f32 %f169, %f153, %f88; + fma.rn.f32 %f263, %f161, %f169, %f167; + mul.f32 %f170, %f159, %f169; + sub.f32 %f265, %f168, %f170; + +BB0_29: + add.s32 %r26, %r2, 1; + @%p27 bra BB0_31; + + rem.s32 %r88, %r26, %r44; + add.s32 %r89, %r88, %r44; + rem.s32 %r125, %r89, %r44; + bra.uni BB0_32; + +BB0_31: + add.s32 %r90, %r44, -1; + min.s32 %r125, %r26, %r90; + +BB0_32: + add.s32 %r91, %r125, %r4; + mad.lo.s32 %r30, %r91, %r43, %r1; + setp.ge.s32 %p38, %r26, %r44; + mov.f32 %f272, 0f00000000; + and.pred %p40, %p38, %p27; + mov.f32 %f273, %f272; + mov.f32 %f274, %f272; + @%p40 bra BB0_34; + + mul.wide.s32 %rd52, %r30, 4; + add.s64 %rd53, %rd6, %rd52; + 
ld.global.nc.f32 %f272, [%rd53]; + add.s64 %rd54, %rd5, %rd52; + ld.global.nc.f32 %f273, [%rd54]; + add.s64 %rd55, %rd4, %rd52; + ld.global.nc.f32 %f274, [%rd55]; + +BB0_34: + mul.f32 %f174, %f273, %f273; + fma.rn.f32 %f175, %f272, %f272, %f174; + fma.rn.f32 %f49, %f274, %f274, %f175; + setp.eq.f32 %p41, %f49, 0f00000000; + mov.u16 %rs40, %rs1; + @%p41 bra BB0_36; + + cvt.s64.s32 %rd56, %r30; + add.s64 %rd57, %rd3, %rd56; + ld.global.nc.u8 %rs40, [%rd57]; + +BB0_36: + setp.gt.u16 %p42, %rs40, %rs1; + cvt.u32.u16 %r92, %rs40; + and.b32 %r93, %r92, 255; + selp.b32 %r94, %r7, %r93, %p42; + selp.b32 %r95, %r93, %r7, %p42; + add.s32 %r96, %r95, 1; + mul.lo.s32 %r97, %r96, %r95; + shr.u32 %r98, %r97, 1; + add.s32 %r31, %r98, %r94; + and.pred %p45, %p41, %p13; + @%p45 bra BB0_38; + + mul.wide.s32 %rd58, %r31, 4; + add.s64 %rd59, %rd2, %rd58; + ld.global.nc.f32 %f176, [%rd59]; + add.f32 %f177, %f176, %f176; + add.s64 %rd60, %rd1, %rd58; + ld.global.nc.f32 %f178, [%rd60]; + div.rn.f32 %f179, %f178, %f177; + mul.f32 %f180, %f179, %f88; + fma.rn.f32 %f181, %f3, %f180, %f1; + mul.f32 %f182, %f1, %f180; + sub.f32 %f183, %f3, %f182; + selp.f32 %f184, %f181, %f272, %p41; + selp.f32 %f185, %f2, %f273, %p41; + selp.f32 %f186, %f183, %f274, %p41; + mul.f32 %f187, %f88, %f88; + div.rn.f32 %f188, %f177, %f187; + sub.f32 %f189, %f184, %f1; + sub.f32 %f190, %f185, %f2; + sub.f32 %f191, %f186, %f3; + fma.rn.f32 %f192, %f189, %f188, %f263; + fma.rn.f32 %f264, %f190, %f188, %f264; + fma.rn.f32 %f193, %f191, %f188, %f265; + div.rn.f32 %f194, %f178, %f88; + mul.f32 %f195, %f186, %f194; + sub.f32 %f263, %f192, %f195; + fma.rn.f32 %f265, %f184, %f194, %f193; + +BB0_38: + setp.eq.s32 %p47, %r45, 1; + @%p47 bra BB0_57; + + and.b16 %rs12, %rs17, 4; + setp.eq.s16 %p48, %rs12, 0; + add.s32 %r32, %r3, -1; + @%p48 bra BB0_41; + + rem.s32 %r99, %r32, %r45; + add.s32 %r100, %r99, %r45; + rem.s32 %r126, %r100, %r45; + bra.uni BB0_42; + +BB0_41: + mov.u32 %r101, 0; + max.s32 %r126, %r32, %r101; + 
+BB0_42: + mad.lo.s32 %r102, %r126, %r44, %r2; + mad.lo.s32 %r36, %r102, %r43, %r1; + setp.lt.s32 %p50, %r32, 0; + mov.f32 %f278, 0f00000000; + and.pred %p51, %p50, %p48; + mov.f32 %f279, %f278; + mov.f32 %f280, %f278; + @%p51 bra BB0_44; + + mul.wide.s32 %rd61, %r36, 4; + add.s64 %rd62, %rd6, %rd61; + ld.global.nc.f32 %f278, [%rd62]; + add.s64 %rd63, %rd5, %rd61; + ld.global.nc.f32 %f279, [%rd63]; + add.s64 %rd64, %rd4, %rd61; + ld.global.nc.f32 %f280, [%rd64]; + +BB0_44: + mul.f32 %f199, %f279, %f279; + fma.rn.f32 %f200, %f278, %f278, %f199; + fma.rn.f32 %f62, %f280, %f280, %f200; + setp.eq.f32 %p52, %f62, 0f00000000; + mov.u16 %rs41, %rs1; + @%p52 bra BB0_46; + + cvt.s64.s32 %rd65, %r36; + add.s64 %rd66, %rd3, %rd65; + ld.global.nc.u8 %rs41, [%rd66]; + +BB0_46: + setp.gt.u16 %p53, %rs41, %rs1; + cvt.u32.u16 %r103, %rs41; + and.b32 %r104, %r103, 255; + selp.b32 %r105, %r7, %r104, %p53; + selp.b32 %r106, %r104, %r7, %p53; + add.s32 %r107, %r106, 1; + mul.lo.s32 %r108, %r107, %r106; + shr.u32 %r109, %r108, 1; + add.s32 %r37, %r109, %r105; + and.pred %p56, %p52, %p13; + @%p56 bra BB0_48; + + mul.wide.s32 %rd67, %r37, 4; + add.s64 %rd68, %rd2, %rd67; + ld.global.nc.f32 %f201, [%rd68]; + add.f32 %f202, %f201, %f201; + add.s64 %rd69, %rd1, %rd67; + ld.global.nc.f32 %f203, [%rd69]; + div.rn.f32 %f204, %f203, %f202; + mul.f32 %f205, %f204, %f89; + fma.rn.f32 %f206, %f2, %f205, %f1; + mul.f32 %f207, %f1, %f205; + sub.f32 %f208, %f2, %f207; + selp.f32 %f209, %f206, %f278, %p52; + selp.f32 %f210, %f208, %f279, %p52; + selp.f32 %f211, %f3, %f280, %p52; + mul.f32 %f212, %f89, %f89; + div.rn.f32 %f213, %f202, %f212; + sub.f32 %f214, %f209, %f1; + sub.f32 %f215, %f210, %f2; + sub.f32 %f216, %f211, %f3; + fma.rn.f32 %f217, %f214, %f213, %f263; + fma.rn.f32 %f218, %f215, %f213, %f264; + fma.rn.f32 %f265, %f216, %f213, %f265; + div.rn.f32 %f219, %f203, %f89; + mul.f32 %f220, %f210, %f219; + sub.f32 %f263, %f217, %f220; + fma.rn.f32 %f264, %f209, %f219, %f218; + +BB0_48: + add.s32 
%r38, %r3, 1; + @%p48 bra BB0_50; + + rem.s32 %r110, %r38, %r45; + add.s32 %r111, %r110, %r45; + rem.s32 %r127, %r111, %r45; + bra.uni BB0_51; + +BB0_50: + add.s32 %r112, %r45, -1; + min.s32 %r127, %r38, %r112; + +BB0_51: + mad.lo.s32 %r113, %r127, %r44, %r2; + mad.lo.s32 %r42, %r113, %r43, %r1; + setp.ge.s32 %p59, %r38, %r45; + mov.f32 %f284, 0f00000000; + and.pred %p61, %p59, %p48; + mov.f32 %f285, %f284; + mov.f32 %f286, %f284; + @%p61 bra BB0_53; + + mul.wide.s32 %rd70, %r42, 4; + add.s64 %rd71, %rd6, %rd70; + ld.global.nc.f32 %f286, [%rd71]; + add.s64 %rd72, %rd5, %rd70; + ld.global.nc.f32 %f285, [%rd72]; + add.s64 %rd73, %rd4, %rd70; + ld.global.nc.f32 %f284, [%rd73]; + +BB0_53: + mul.f32 %f224, %f286, %f286; + fma.rn.f32 %f225, %f285, %f285, %f224; + fma.rn.f32 %f75, %f284, %f284, %f225; + setp.eq.f32 %p62, %f75, 0f00000000; + mov.u16 %rs42, %rs1; + @%p62 bra BB0_55; + + cvt.s64.s32 %rd74, %r42; + add.s64 %rd75, %rd3, %rd74; + ld.global.nc.u8 %rs42, [%rd75]; + +BB0_55: + setp.gt.u16 %p63, %rs42, %rs1; + cvt.u32.u16 %r114, %rs42; + and.b32 %r115, %r114, 255; + selp.b32 %r116, %r7, %r115, %p63; + selp.b32 %r117, %r115, %r7, %p63; + add.s32 %r118, %r117, 1; + mul.lo.s32 %r119, %r118, %r117; + shr.u32 %r120, %r119, 1; + add.s32 %r121, %r120, %r116; + mul.wide.s32 %rd76, %r121, 4; + add.s64 %rd7, %rd2, %rd76; + add.s64 %rd8, %rd1, %rd76; + and.pred %p66, %p62, %p13; + @%p66 bra BB0_57; + + ld.global.nc.f32 %f226, [%rd7]; + add.f32 %f227, %f226, %f226; + ld.global.nc.f32 %f228, [%rd8]; + div.rn.f32 %f229, %f228, %f227; + mul.f32 %f230, %f229, %f89; + mul.f32 %f231, %f2, %f230; + sub.f32 %f232, %f1, %f231; + fma.rn.f32 %f233, %f1, %f230, %f2; + selp.f32 %f234, %f3, %f284, %p62; + selp.f32 %f235, %f233, %f285, %p62; + selp.f32 %f236, %f232, %f286, %p62; + mul.f32 %f237, %f89, %f89; + div.rn.f32 %f238, %f227, %f237; + sub.f32 %f239, %f236, %f1; + sub.f32 %f240, %f235, %f2; + sub.f32 %f241, %f234, %f3; + fma.rn.f32 %f242, %f239, %f238, %f263; + fma.rn.f32 %f243, 
%f240, %f238, %f264; + fma.rn.f32 %f265, %f241, %f238, %f265; + div.rn.f32 %f244, %f228, %f89; + fma.rn.f32 %f263, %f235, %f244, %f242; + mul.f32 %f245, %f236, %f244; + sub.f32 %f264, %f243, %f245; + +BB0_57: + setp.eq.s64 %p68, %rd12, 0; + @%p68 bra BB0_59; + + cvta.to.global.u64 %rd77, %rd12; + add.s64 %rd79, %rd77, %rd19; + ld.global.nc.f32 %f246, [%rd79]; + mul.f32 %f290, %f246, %f290; + +BB0_59: + setp.eq.f32 %p69, %f290, 0f00000000; + mov.f32 %f291, 0f00000000; + @%p69 bra BB0_61; + + rcp.rn.f32 %f291, %f290; + +BB0_61: + cvta.to.global.u64 %rd80, %rd11; + cvta.to.global.u64 %rd81, %rd10; + cvta.to.global.u64 %rd82, %rd9; + add.s64 %rd84, %rd82, %rd19; + ld.global.f32 %f248, [%rd84]; + fma.rn.f32 %f249, %f263, %f291, %f248; + st.global.f32 [%rd84], %f249; + add.s64 %rd85, %rd81, %rd19; + ld.global.f32 %f250, [%rd85]; + fma.rn.f32 %f251, %f264, %f291, %f250; + st.global.f32 [%rd85], %f251; + add.s64 %rd86, %rd80, %rd19; + ld.global.f32 %f252, [%rd86]; + fma.rn.f32 %f253, %f265, %f291, %f252; + st.global.f32 [%rd86], %f253; + +BB0_62: + ret; +} + + +` + ) diff --git a/cuda/dotproduct_wrapper.go b/cuda/dotproduct_wrapper.go index 65a1da8d0..a318c7861 100644 --- a/cuda/dotproduct_wrapper.go +++ b/cuda/dotproduct_wrapper.go @@ -5,50 +5,50 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for dotproduct kernel var dotproduct_code cu.Function // Stores the arguments for dotproduct kernel invocation -type dotproduct_args_t struct { - arg_dst unsafe.Pointer - arg_prefactor float32 - arg_ax unsafe.Pointer - arg_ay unsafe.Pointer - arg_az unsafe.Pointer - arg_bx unsafe.Pointer - arg_by unsafe.Pointer - arg_bz unsafe.Pointer - arg_N int - argptr [9]unsafe.Pointer +type dotproduct_args_t struct{ + arg_dst unsafe.Pointer + arg_prefactor float32 + arg_ax unsafe.Pointer + arg_ay unsafe.Pointer + arg_az unsafe.Pointer + arg_bx unsafe.Pointer + arg_by unsafe.Pointer + arg_bz unsafe.Pointer + arg_N int + argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for dotproduct kernel invocation var dotproduct_args dotproduct_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - dotproduct_args.argptr[0] = unsafe.Pointer(&dotproduct_args.arg_dst) - dotproduct_args.argptr[1] = unsafe.Pointer(&dotproduct_args.arg_prefactor) - dotproduct_args.argptr[2] = unsafe.Pointer(&dotproduct_args.arg_ax) - dotproduct_args.argptr[3] = unsafe.Pointer(&dotproduct_args.arg_ay) - dotproduct_args.argptr[4] = unsafe.Pointer(&dotproduct_args.arg_az) - dotproduct_args.argptr[5] = unsafe.Pointer(&dotproduct_args.arg_bx) - dotproduct_args.argptr[6] = unsafe.Pointer(&dotproduct_args.arg_by) - dotproduct_args.argptr[7] = unsafe.Pointer(&dotproduct_args.arg_bz) - dotproduct_args.argptr[8] = unsafe.Pointer(&dotproduct_args.arg_N) -} + dotproduct_args.argptr[0] = unsafe.Pointer(&dotproduct_args.arg_dst) + dotproduct_args.argptr[1] = unsafe.Pointer(&dotproduct_args.arg_prefactor) + dotproduct_args.argptr[2] = unsafe.Pointer(&dotproduct_args.arg_ax) + dotproduct_args.argptr[3] = unsafe.Pointer(&dotproduct_args.arg_ay) + dotproduct_args.argptr[4] = unsafe.Pointer(&dotproduct_args.arg_az) + 
dotproduct_args.argptr[5] = unsafe.Pointer(&dotproduct_args.arg_bx) + dotproduct_args.argptr[6] = unsafe.Pointer(&dotproduct_args.arg_by) + dotproduct_args.argptr[7] = unsafe.Pointer(&dotproduct_args.arg_bz) + dotproduct_args.argptr[8] = unsafe.Pointer(&dotproduct_args.arg_N) + } // Wrapper for dotproduct CUDA kernel, asynchronous. -func k_dotproduct_async(dst unsafe.Pointer, prefactor float32, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_dotproduct_async ( dst unsafe.Pointer, prefactor float32, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("dotproduct") } @@ -56,45 +56,46 @@ func k_dotproduct_async(dst unsafe.Pointer, prefactor float32, ax unsafe.Pointer dotproduct_args.Lock() defer dotproduct_args.Unlock() - if dotproduct_code == 0 { + if dotproduct_code == 0{ dotproduct_code = fatbinLoad(dotproduct_map, "dotproduct") } - dotproduct_args.arg_dst = dst - dotproduct_args.arg_prefactor = prefactor - dotproduct_args.arg_ax = ax - dotproduct_args.arg_ay = ay - dotproduct_args.arg_az = az - dotproduct_args.arg_bx = bx - dotproduct_args.arg_by = by - dotproduct_args.arg_bz = bz - dotproduct_args.arg_N = N + dotproduct_args.arg_dst = dst + dotproduct_args.arg_prefactor = prefactor + dotproduct_args.arg_ax = ax + dotproduct_args.arg_ay = ay + dotproduct_args.arg_az = az + dotproduct_args.arg_bx = bx + dotproduct_args.arg_by = by + dotproduct_args.arg_bz = bz + dotproduct_args.arg_N = N + args := dotproduct_args.argptr[:] cu.LaunchKernel(dotproduct_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("dotproduct") } } // maps compute capability on PTX code for dotproduct kernel. 
-var dotproduct_map = map[int]string{0: "", - 30: dotproduct_ptx_30, - 35: dotproduct_ptx_35, - 37: dotproduct_ptx_37, - 50: dotproduct_ptx_50, - 52: dotproduct_ptx_52, - 53: dotproduct_ptx_53, - 60: dotproduct_ptx_60, - 61: dotproduct_ptx_61, - 70: dotproduct_ptx_70, - 75: dotproduct_ptx_75} +var dotproduct_map = map[int]string{ 0: "" , +30: dotproduct_ptx_30 , +35: dotproduct_ptx_35 , +37: dotproduct_ptx_37 , +50: dotproduct_ptx_50 , +52: dotproduct_ptx_52 , +53: dotproduct_ptx_53 , +60: dotproduct_ptx_60 , +61: dotproduct_ptx_61 , +70: dotproduct_ptx_70 , +75: dotproduct_ptx_75 } // dotproduct PTX code for various compute capabilities. -const ( - dotproduct_ptx_30 = ` +const( + dotproduct_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -172,7 +173,7 @@ BB0_2: ` - dotproduct_ptx_35 = ` + dotproduct_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -250,7 +251,7 @@ BB0_2: ` - dotproduct_ptx_37 = ` + dotproduct_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -328,7 +329,7 @@ BB0_2: ` - dotproduct_ptx_50 = ` + dotproduct_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -406,7 +407,7 @@ BB0_2: ` - dotproduct_ptx_52 = ` + dotproduct_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -484,7 +485,7 @@ BB0_2: ` - dotproduct_ptx_53 = ` + dotproduct_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -562,7 +563,7 @@ BB0_2: ` - dotproduct_ptx_60 = ` + dotproduct_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -640,7 +641,7 @@ BB0_2: ` - dotproduct_ptx_61 = ` + dotproduct_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -718,7 +719,7 @@ BB0_2: ` - dotproduct_ptx_70 = ` + dotproduct_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -796,7 +797,7 @@ BB0_2: ` - dotproduct_ptx_75 = ` + dotproduct_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -874,4 +875,4 @@ BB0_2: ` -) + ) diff --git a/cuda/exchange_wrapper.go b/cuda/exchange_wrapper.go index 014b5e0ef..e0e3b9960 100644 --- 
a/cuda/exchange_wrapper.go +++ b/cuda/exchange_wrapper.go @@ -5,66 +5,66 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for addexchange kernel var addexchange_code cu.Function // Stores the arguments for addexchange kernel invocation -type addexchange_args_t struct { - arg_Bx unsafe.Pointer - arg_By unsafe.Pointer - arg_Bz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_aLUT2d unsafe.Pointer - arg_regions unsafe.Pointer - arg_wx float32 - arg_wy float32 - arg_wz float32 - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - argptr [17]unsafe.Pointer +type addexchange_args_t struct{ + arg_Bx unsafe.Pointer + arg_By unsafe.Pointer + arg_Bz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_aLUT2d unsafe.Pointer + arg_regions unsafe.Pointer + arg_wx float32 + arg_wy float32 + arg_wz float32 + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + argptr [17]unsafe.Pointer sync.Mutex } // Stores the arguments for addexchange kernel invocation var addexchange_args addexchange_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- addexchange_args.argptr[0] = unsafe.Pointer(&addexchange_args.arg_Bx) - addexchange_args.argptr[1] = unsafe.Pointer(&addexchange_args.arg_By) - addexchange_args.argptr[2] = unsafe.Pointer(&addexchange_args.arg_Bz) - addexchange_args.argptr[3] = unsafe.Pointer(&addexchange_args.arg_mx) - addexchange_args.argptr[4] = unsafe.Pointer(&addexchange_args.arg_my) - addexchange_args.argptr[5] = unsafe.Pointer(&addexchange_args.arg_mz) - addexchange_args.argptr[6] = unsafe.Pointer(&addexchange_args.arg_Ms_) - addexchange_args.argptr[7] = unsafe.Pointer(&addexchange_args.arg_Ms_mul) - addexchange_args.argptr[8] = unsafe.Pointer(&addexchange_args.arg_aLUT2d) - addexchange_args.argptr[9] = unsafe.Pointer(&addexchange_args.arg_regions) - addexchange_args.argptr[10] = unsafe.Pointer(&addexchange_args.arg_wx) - addexchange_args.argptr[11] = unsafe.Pointer(&addexchange_args.arg_wy) - addexchange_args.argptr[12] = unsafe.Pointer(&addexchange_args.arg_wz) - addexchange_args.argptr[13] = unsafe.Pointer(&addexchange_args.arg_Nx) - addexchange_args.argptr[14] = unsafe.Pointer(&addexchange_args.arg_Ny) - addexchange_args.argptr[15] = unsafe.Pointer(&addexchange_args.arg_Nz) - addexchange_args.argptr[16] = unsafe.Pointer(&addexchange_args.arg_PBC) -} + addexchange_args.argptr[0] = unsafe.Pointer(&addexchange_args.arg_Bx) + addexchange_args.argptr[1] = unsafe.Pointer(&addexchange_args.arg_By) + addexchange_args.argptr[2] = unsafe.Pointer(&addexchange_args.arg_Bz) + addexchange_args.argptr[3] = unsafe.Pointer(&addexchange_args.arg_mx) + addexchange_args.argptr[4] = unsafe.Pointer(&addexchange_args.arg_my) + addexchange_args.argptr[5] = unsafe.Pointer(&addexchange_args.arg_mz) + addexchange_args.argptr[6] = unsafe.Pointer(&addexchange_args.arg_Ms_) + addexchange_args.argptr[7] = unsafe.Pointer(&addexchange_args.arg_Ms_mul) + addexchange_args.argptr[8] = unsafe.Pointer(&addexchange_args.arg_aLUT2d) + addexchange_args.argptr[9] = unsafe.Pointer(&addexchange_args.arg_regions) + 
addexchange_args.argptr[10] = unsafe.Pointer(&addexchange_args.arg_wx) + addexchange_args.argptr[11] = unsafe.Pointer(&addexchange_args.arg_wy) + addexchange_args.argptr[12] = unsafe.Pointer(&addexchange_args.arg_wz) + addexchange_args.argptr[13] = unsafe.Pointer(&addexchange_args.arg_Nx) + addexchange_args.argptr[14] = unsafe.Pointer(&addexchange_args.arg_Ny) + addexchange_args.argptr[15] = unsafe.Pointer(&addexchange_args.arg_Nz) + addexchange_args.argptr[16] = unsafe.Pointer(&addexchange_args.arg_PBC) + } // Wrapper for addexchange CUDA kernel, asynchronous. -func k_addexchange_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { - if Synchronous { // debug +func k_addexchange_async ( Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("addexchange") } @@ -72,53 +72,54 @@ func k_addexchange_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer addexchange_args.Lock() defer addexchange_args.Unlock() - if addexchange_code == 0 { + if addexchange_code == 0{ addexchange_code = fatbinLoad(addexchange_map, "addexchange") } - addexchange_args.arg_Bx = Bx - addexchange_args.arg_By = By - addexchange_args.arg_Bz = Bz - addexchange_args.arg_mx = mx - addexchange_args.arg_my = my - addexchange_args.arg_mz = mz - addexchange_args.arg_Ms_ = Ms_ - addexchange_args.arg_Ms_mul = Ms_mul - addexchange_args.arg_aLUT2d = aLUT2d - addexchange_args.arg_regions = regions - addexchange_args.arg_wx = wx - addexchange_args.arg_wy = wy - addexchange_args.arg_wz = wz - 
addexchange_args.arg_Nx = Nx - addexchange_args.arg_Ny = Ny - addexchange_args.arg_Nz = Nz - addexchange_args.arg_PBC = PBC + addexchange_args.arg_Bx = Bx + addexchange_args.arg_By = By + addexchange_args.arg_Bz = Bz + addexchange_args.arg_mx = mx + addexchange_args.arg_my = my + addexchange_args.arg_mz = mz + addexchange_args.arg_Ms_ = Ms_ + addexchange_args.arg_Ms_mul = Ms_mul + addexchange_args.arg_aLUT2d = aLUT2d + addexchange_args.arg_regions = regions + addexchange_args.arg_wx = wx + addexchange_args.arg_wy = wy + addexchange_args.arg_wz = wz + addexchange_args.arg_Nx = Nx + addexchange_args.arg_Ny = Ny + addexchange_args.arg_Nz = Nz + addexchange_args.arg_PBC = PBC + args := addexchange_args.argptr[:] cu.LaunchKernel(addexchange_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("addexchange") } } // maps compute capability on PTX code for addexchange kernel. -var addexchange_map = map[int]string{0: "", - 30: addexchange_ptx_30, - 35: addexchange_ptx_35, - 37: addexchange_ptx_37, - 50: addexchange_ptx_50, - 52: addexchange_ptx_52, - 53: addexchange_ptx_53, - 60: addexchange_ptx_60, - 61: addexchange_ptx_61, - 70: addexchange_ptx_70, - 75: addexchange_ptx_75} +var addexchange_map = map[int]string{ 0: "" , +30: addexchange_ptx_30 , +35: addexchange_ptx_35 , +37: addexchange_ptx_37 , +50: addexchange_ptx_50 , +52: addexchange_ptx_52 , +53: addexchange_ptx_53 , +60: addexchange_ptx_60 , +61: addexchange_ptx_61 , +70: addexchange_ptx_70 , +75: addexchange_ptx_75 } // addexchange PTX code for various compute capabilities. 
-const ( - addexchange_ptx_30 = ` +const( + addexchange_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -567,7 +568,7 @@ BB0_27: ` - addexchange_ptx_35 = ` + addexchange_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -1017,7 +1018,7 @@ BB0_27: ` - addexchange_ptx_37 = ` + addexchange_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -1467,7 +1468,7 @@ BB0_27: ` - addexchange_ptx_50 = ` + addexchange_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -1917,7 +1918,7 @@ BB0_27: ` - addexchange_ptx_52 = ` + addexchange_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -2367,7 +2368,7 @@ BB0_27: ` - addexchange_ptx_53 = ` + addexchange_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -2817,7 +2818,7 @@ BB0_27: ` - addexchange_ptx_60 = ` + addexchange_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -3267,7 +3268,7 @@ BB0_27: ` - addexchange_ptx_61 = ` + addexchange_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -3717,7 +3718,7 @@ BB0_27: ` - addexchange_ptx_70 = ` + addexchange_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -4167,7 +4168,7 @@ BB0_27: ` - addexchange_ptx_75 = ` + addexchange_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -4617,4 +4618,4 @@ BB0_27: ` -) + ) diff --git a/cuda/exchangedecode_wrapper.go b/cuda/exchangedecode_wrapper.go index 89bbf4c6a..83dc2b95b 100644 --- a/cuda/exchangedecode_wrapper.go +++ b/cuda/exchangedecode_wrapper.go @@ -5,52 +5,52 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for exchangedecode kernel var exchangedecode_code cu.Function // Stores the arguments for exchangedecode kernel invocation -type exchangedecode_args_t struct { - arg_dst unsafe.Pointer - arg_aLUT2d unsafe.Pointer - arg_regions unsafe.Pointer - arg_wx float32 - arg_wy float32 - arg_wz float32 - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - argptr [10]unsafe.Pointer +type exchangedecode_args_t struct{ + arg_dst unsafe.Pointer + arg_aLUT2d unsafe.Pointer + arg_regions unsafe.Pointer + arg_wx float32 + arg_wy float32 + arg_wz float32 + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for exchangedecode kernel invocation var exchangedecode_args exchangedecode_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - exchangedecode_args.argptr[0] = unsafe.Pointer(&exchangedecode_args.arg_dst) - exchangedecode_args.argptr[1] = unsafe.Pointer(&exchangedecode_args.arg_aLUT2d) - exchangedecode_args.argptr[2] = unsafe.Pointer(&exchangedecode_args.arg_regions) - exchangedecode_args.argptr[3] = unsafe.Pointer(&exchangedecode_args.arg_wx) - exchangedecode_args.argptr[4] = unsafe.Pointer(&exchangedecode_args.arg_wy) - exchangedecode_args.argptr[5] = unsafe.Pointer(&exchangedecode_args.arg_wz) - exchangedecode_args.argptr[6] = unsafe.Pointer(&exchangedecode_args.arg_Nx) - exchangedecode_args.argptr[7] = unsafe.Pointer(&exchangedecode_args.arg_Ny) - exchangedecode_args.argptr[8] = unsafe.Pointer(&exchangedecode_args.arg_Nz) - exchangedecode_args.argptr[9] = unsafe.Pointer(&exchangedecode_args.arg_PBC) -} + exchangedecode_args.argptr[0] = unsafe.Pointer(&exchangedecode_args.arg_dst) + exchangedecode_args.argptr[1] = unsafe.Pointer(&exchangedecode_args.arg_aLUT2d) + exchangedecode_args.argptr[2] = 
unsafe.Pointer(&exchangedecode_args.arg_regions) + exchangedecode_args.argptr[3] = unsafe.Pointer(&exchangedecode_args.arg_wx) + exchangedecode_args.argptr[4] = unsafe.Pointer(&exchangedecode_args.arg_wy) + exchangedecode_args.argptr[5] = unsafe.Pointer(&exchangedecode_args.arg_wz) + exchangedecode_args.argptr[6] = unsafe.Pointer(&exchangedecode_args.arg_Nx) + exchangedecode_args.argptr[7] = unsafe.Pointer(&exchangedecode_args.arg_Ny) + exchangedecode_args.argptr[8] = unsafe.Pointer(&exchangedecode_args.arg_Nz) + exchangedecode_args.argptr[9] = unsafe.Pointer(&exchangedecode_args.arg_PBC) + } // Wrapper for exchangedecode CUDA kernel, asynchronous. -func k_exchangedecode_async(dst unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { - if Synchronous { // debug +func k_exchangedecode_async ( dst unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("exchangedecode") } @@ -58,46 +58,47 @@ func k_exchangedecode_async(dst unsafe.Pointer, aLUT2d unsafe.Pointer, regions u exchangedecode_args.Lock() defer exchangedecode_args.Unlock() - if exchangedecode_code == 0 { + if exchangedecode_code == 0{ exchangedecode_code = fatbinLoad(exchangedecode_map, "exchangedecode") } - exchangedecode_args.arg_dst = dst - exchangedecode_args.arg_aLUT2d = aLUT2d - exchangedecode_args.arg_regions = regions - exchangedecode_args.arg_wx = wx - exchangedecode_args.arg_wy = wy - exchangedecode_args.arg_wz = wz - exchangedecode_args.arg_Nx = Nx - exchangedecode_args.arg_Ny = Ny - exchangedecode_args.arg_Nz = Nz - exchangedecode_args.arg_PBC = PBC + exchangedecode_args.arg_dst = dst + exchangedecode_args.arg_aLUT2d = aLUT2d + exchangedecode_args.arg_regions = regions + exchangedecode_args.arg_wx = wx + exchangedecode_args.arg_wy = wy + exchangedecode_args.arg_wz = wz 
+ exchangedecode_args.arg_Nx = Nx + exchangedecode_args.arg_Ny = Ny + exchangedecode_args.arg_Nz = Nz + exchangedecode_args.arg_PBC = PBC + args := exchangedecode_args.argptr[:] cu.LaunchKernel(exchangedecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("exchangedecode") } } // maps compute capability on PTX code for exchangedecode kernel. -var exchangedecode_map = map[int]string{0: "", - 30: exchangedecode_ptx_30, - 35: exchangedecode_ptx_35, - 37: exchangedecode_ptx_37, - 50: exchangedecode_ptx_50, - 52: exchangedecode_ptx_52, - 53: exchangedecode_ptx_53, - 60: exchangedecode_ptx_60, - 61: exchangedecode_ptx_61, - 70: exchangedecode_ptx_70, - 75: exchangedecode_ptx_75} +var exchangedecode_map = map[int]string{ 0: "" , +30: exchangedecode_ptx_30 , +35: exchangedecode_ptx_35 , +37: exchangedecode_ptx_37 , +50: exchangedecode_ptx_50 , +52: exchangedecode_ptx_52 , +53: exchangedecode_ptx_53 , +60: exchangedecode_ptx_60 , +61: exchangedecode_ptx_61 , +70: exchangedecode_ptx_70 , +75: exchangedecode_ptx_75 } // exchangedecode PTX code for various compute capabilities. 
-const ( - exchangedecode_ptx_30 = ` +const( + exchangedecode_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -361,7 +362,7 @@ BB0_22: ` - exchangedecode_ptx_35 = ` + exchangedecode_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -631,7 +632,7 @@ BB0_22: ` - exchangedecode_ptx_37 = ` + exchangedecode_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -901,7 +902,7 @@ BB0_22: ` - exchangedecode_ptx_50 = ` + exchangedecode_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -1171,7 +1172,7 @@ BB0_22: ` - exchangedecode_ptx_52 = ` + exchangedecode_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -1441,7 +1442,7 @@ BB0_22: ` - exchangedecode_ptx_53 = ` + exchangedecode_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -1711,7 +1712,7 @@ BB0_22: ` - exchangedecode_ptx_60 = ` + exchangedecode_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -1981,7 +1982,7 @@ BB0_22: ` - exchangedecode_ptx_61 = ` + exchangedecode_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -2251,7 +2252,7 @@ BB0_22: ` - exchangedecode_ptx_70 = ` + exchangedecode_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -2521,7 +2522,7 @@ BB0_22: ` - exchangedecode_ptx_75 = ` + exchangedecode_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -2791,4 +2792,4 @@ BB0_22: ` -) + ) diff --git a/cuda/kernmulc_wrapper.go b/cuda/kernmulc_wrapper.go index caa219fb1..bf9a86c85 100644 --- a/cuda/kernmulc_wrapper.go +++ b/cuda/kernmulc_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for kernmulC kernel var kernmulC_code cu.Function // Stores the arguments for kernmulC kernel invocation -type kernmulC_args_t struct { - arg_fftM unsafe.Pointer - arg_fftK unsafe.Pointer - arg_Nx int - arg_Ny int - argptr [4]unsafe.Pointer +type kernmulC_args_t struct{ + arg_fftM unsafe.Pointer + arg_fftK unsafe.Pointer + arg_Nx int + arg_Ny int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for kernmulC kernel invocation var kernmulC_args kernmulC_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - kernmulC_args.argptr[0] = unsafe.Pointer(&kernmulC_args.arg_fftM) - kernmulC_args.argptr[1] = unsafe.Pointer(&kernmulC_args.arg_fftK) - kernmulC_args.argptr[2] = unsafe.Pointer(&kernmulC_args.arg_Nx) - kernmulC_args.argptr[3] = unsafe.Pointer(&kernmulC_args.arg_Ny) -} + kernmulC_args.argptr[0] = unsafe.Pointer(&kernmulC_args.arg_fftM) + kernmulC_args.argptr[1] = unsafe.Pointer(&kernmulC_args.arg_fftK) + kernmulC_args.argptr[2] = unsafe.Pointer(&kernmulC_args.arg_Nx) + kernmulC_args.argptr[3] = unsafe.Pointer(&kernmulC_args.arg_Ny) + } // Wrapper for kernmulC CUDA kernel, asynchronous. 
-func k_kernmulC_async(fftM unsafe.Pointer, fftK unsafe.Pointer, Nx int, Ny int, cfg *config) { - if Synchronous { // debug +func k_kernmulC_async ( fftM unsafe.Pointer, fftK unsafe.Pointer, Nx int, Ny int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("kernmulC") } @@ -46,40 +46,41 @@ func k_kernmulC_async(fftM unsafe.Pointer, fftK unsafe.Pointer, Nx int, Ny int, kernmulC_args.Lock() defer kernmulC_args.Unlock() - if kernmulC_code == 0 { + if kernmulC_code == 0{ kernmulC_code = fatbinLoad(kernmulC_map, "kernmulC") } - kernmulC_args.arg_fftM = fftM - kernmulC_args.arg_fftK = fftK - kernmulC_args.arg_Nx = Nx - kernmulC_args.arg_Ny = Ny + kernmulC_args.arg_fftM = fftM + kernmulC_args.arg_fftK = fftK + kernmulC_args.arg_Nx = Nx + kernmulC_args.arg_Ny = Ny + args := kernmulC_args.argptr[:] cu.LaunchKernel(kernmulC_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("kernmulC") } } // maps compute capability on PTX code for kernmulC kernel. -var kernmulC_map = map[int]string{0: "", - 30: kernmulC_ptx_30, - 35: kernmulC_ptx_35, - 37: kernmulC_ptx_37, - 50: kernmulC_ptx_50, - 52: kernmulC_ptx_52, - 53: kernmulC_ptx_53, - 60: kernmulC_ptx_60, - 61: kernmulC_ptx_61, - 70: kernmulC_ptx_70, - 75: kernmulC_ptx_75} +var kernmulC_map = map[int]string{ 0: "" , +30: kernmulC_ptx_30 , +35: kernmulC_ptx_35 , +37: kernmulC_ptx_37 , +50: kernmulC_ptx_50 , +52: kernmulC_ptx_52 , +53: kernmulC_ptx_53 , +60: kernmulC_ptx_60 , +61: kernmulC_ptx_61 , +70: kernmulC_ptx_70 , +75: kernmulC_ptx_75 } // kernmulC PTX code for various compute capabilities. 
-const ( - kernmulC_ptx_30 = ` +const( + kernmulC_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -141,7 +142,7 @@ BB0_2: ` - kernmulC_ptx_35 = ` + kernmulC_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -203,7 +204,7 @@ BB0_2: ` - kernmulC_ptx_37 = ` + kernmulC_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -265,7 +266,7 @@ BB0_2: ` - kernmulC_ptx_50 = ` + kernmulC_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -327,7 +328,7 @@ BB0_2: ` - kernmulC_ptx_52 = ` + kernmulC_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -389,7 +390,7 @@ BB0_2: ` - kernmulC_ptx_53 = ` + kernmulC_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -451,7 +452,7 @@ BB0_2: ` - kernmulC_ptx_60 = ` + kernmulC_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -513,7 +514,7 @@ BB0_2: ` - kernmulC_ptx_61 = ` + kernmulC_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -575,7 +576,7 @@ BB0_2: ` - kernmulC_ptx_70 = ` + kernmulC_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -637,7 +638,7 @@ BB0_2: ` - kernmulC_ptx_75 = ` + kernmulC_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -699,4 +700,4 @@ BB0_2: ` -) + ) diff --git a/cuda/kernmulrsymm2dxy_wrapper.go b/cuda/kernmulrsymm2dxy_wrapper.go index 7e2c1c18a..2fc58d609 100644 --- a/cuda/kernmulrsymm2dxy_wrapper.go +++ b/cuda/kernmulrsymm2dxy_wrapper.go @@ -5,46 +5,46 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for kernmulRSymm2Dxy kernel var kernmulRSymm2Dxy_code cu.Function // Stores the arguments for kernmulRSymm2Dxy kernel invocation -type kernmulRSymm2Dxy_args_t struct { - arg_fftMx unsafe.Pointer - arg_fftMy unsafe.Pointer - arg_fftKxx unsafe.Pointer - arg_fftKyy unsafe.Pointer - arg_fftKxy unsafe.Pointer - arg_Nx int - arg_Ny int - argptr [7]unsafe.Pointer +type kernmulRSymm2Dxy_args_t struct{ + arg_fftMx unsafe.Pointer + arg_fftMy unsafe.Pointer + arg_fftKxx unsafe.Pointer + arg_fftKyy unsafe.Pointer + arg_fftKxy unsafe.Pointer + arg_Nx int + arg_Ny int + argptr [7]unsafe.Pointer sync.Mutex } // Stores the arguments for kernmulRSymm2Dxy kernel invocation var kernmulRSymm2Dxy_args kernmulRSymm2Dxy_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - kernmulRSymm2Dxy_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMx) - kernmulRSymm2Dxy_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMy) - kernmulRSymm2Dxy_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxx) - kernmulRSymm2Dxy_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKyy) - kernmulRSymm2Dxy_args.argptr[4] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxy) - kernmulRSymm2Dxy_args.argptr[5] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Nx) - kernmulRSymm2Dxy_args.argptr[6] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Ny) -} + kernmulRSymm2Dxy_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMx) + kernmulRSymm2Dxy_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMy) + kernmulRSymm2Dxy_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxx) + kernmulRSymm2Dxy_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKyy) + kernmulRSymm2Dxy_args.argptr[4] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxy) + kernmulRSymm2Dxy_args.argptr[5] = 
unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Nx) + kernmulRSymm2Dxy_args.argptr[6] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Ny) + } // Wrapper for kernmulRSymm2Dxy CUDA kernel, asynchronous. -func k_kernmulRSymm2Dxy_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, cfg *config) { - if Synchronous { // debug +func k_kernmulRSymm2Dxy_async ( fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("kernmulRSymm2Dxy") } @@ -52,43 +52,44 @@ func k_kernmulRSymm2Dxy_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftKxx kernmulRSymm2Dxy_args.Lock() defer kernmulRSymm2Dxy_args.Unlock() - if kernmulRSymm2Dxy_code == 0 { + if kernmulRSymm2Dxy_code == 0{ kernmulRSymm2Dxy_code = fatbinLoad(kernmulRSymm2Dxy_map, "kernmulRSymm2Dxy") } - kernmulRSymm2Dxy_args.arg_fftMx = fftMx - kernmulRSymm2Dxy_args.arg_fftMy = fftMy - kernmulRSymm2Dxy_args.arg_fftKxx = fftKxx - kernmulRSymm2Dxy_args.arg_fftKyy = fftKyy - kernmulRSymm2Dxy_args.arg_fftKxy = fftKxy - kernmulRSymm2Dxy_args.arg_Nx = Nx - kernmulRSymm2Dxy_args.arg_Ny = Ny + kernmulRSymm2Dxy_args.arg_fftMx = fftMx + kernmulRSymm2Dxy_args.arg_fftMy = fftMy + kernmulRSymm2Dxy_args.arg_fftKxx = fftKxx + kernmulRSymm2Dxy_args.arg_fftKyy = fftKyy + kernmulRSymm2Dxy_args.arg_fftKxy = fftKxy + kernmulRSymm2Dxy_args.arg_Nx = Nx + kernmulRSymm2Dxy_args.arg_Ny = Ny + args := kernmulRSymm2Dxy_args.argptr[:] cu.LaunchKernel(kernmulRSymm2Dxy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("kernmulRSymm2Dxy") } } // maps compute capability on PTX code for kernmulRSymm2Dxy kernel. 
-var kernmulRSymm2Dxy_map = map[int]string{0: "", - 30: kernmulRSymm2Dxy_ptx_30, - 35: kernmulRSymm2Dxy_ptx_35, - 37: kernmulRSymm2Dxy_ptx_37, - 50: kernmulRSymm2Dxy_ptx_50, - 52: kernmulRSymm2Dxy_ptx_52, - 53: kernmulRSymm2Dxy_ptx_53, - 60: kernmulRSymm2Dxy_ptx_60, - 61: kernmulRSymm2Dxy_ptx_61, - 70: kernmulRSymm2Dxy_ptx_70, - 75: kernmulRSymm2Dxy_ptx_75} +var kernmulRSymm2Dxy_map = map[int]string{ 0: "" , +30: kernmulRSymm2Dxy_ptx_30 , +35: kernmulRSymm2Dxy_ptx_35 , +37: kernmulRSymm2Dxy_ptx_37 , +50: kernmulRSymm2Dxy_ptx_50 , +52: kernmulRSymm2Dxy_ptx_52 , +53: kernmulRSymm2Dxy_ptx_53 , +60: kernmulRSymm2Dxy_ptx_60 , +61: kernmulRSymm2Dxy_ptx_61 , +70: kernmulRSymm2Dxy_ptx_70 , +75: kernmulRSymm2Dxy_ptx_75 } // kernmulRSymm2Dxy PTX code for various compute capabilities. -const ( - kernmulRSymm2Dxy_ptx_30 = ` +const( + kernmulRSymm2Dxy_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -180,7 +181,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_35 = ` + kernmulRSymm2Dxy_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -272,7 +273,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_37 = ` + kernmulRSymm2Dxy_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -364,7 +365,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_50 = ` + kernmulRSymm2Dxy_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -456,7 +457,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_52 = ` + kernmulRSymm2Dxy_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -548,7 +549,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_53 = ` + kernmulRSymm2Dxy_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -640,7 +641,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_60 = ` + kernmulRSymm2Dxy_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -732,7 +733,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_61 = ` + kernmulRSymm2Dxy_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -824,7 +825,7 @@ BB0_2: ` - kernmulRSymm2Dxy_ptx_70 = ` + kernmulRSymm2Dxy_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -916,7 +917,7 @@ BB0_2: ` 
- kernmulRSymm2Dxy_ptx_75 = ` + kernmulRSymm2Dxy_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1008,4 +1009,4 @@ BB0_2: ` -) + ) diff --git a/cuda/kernmulrsymm2dz_wrapper.go b/cuda/kernmulrsymm2dz_wrapper.go index 54eefa5e1..5868bff20 100644 --- a/cuda/kernmulrsymm2dz_wrapper.go +++ b/cuda/kernmulrsymm2dz_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for kernmulRSymm2Dz kernel var kernmulRSymm2Dz_code cu.Function // Stores the arguments for kernmulRSymm2Dz kernel invocation -type kernmulRSymm2Dz_args_t struct { - arg_fftMz unsafe.Pointer - arg_fftKzz unsafe.Pointer - arg_Nx int - arg_Ny int - argptr [4]unsafe.Pointer +type kernmulRSymm2Dz_args_t struct{ + arg_fftMz unsafe.Pointer + arg_fftKzz unsafe.Pointer + arg_Nx int + arg_Ny int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for kernmulRSymm2Dz kernel invocation var kernmulRSymm2Dz_args kernmulRSymm2Dz_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - kernmulRSymm2Dz_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftMz) - kernmulRSymm2Dz_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftKzz) - kernmulRSymm2Dz_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Nx) - kernmulRSymm2Dz_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Ny) -} + kernmulRSymm2Dz_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftMz) + kernmulRSymm2Dz_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftKzz) + kernmulRSymm2Dz_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Nx) + kernmulRSymm2Dz_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Ny) + } // Wrapper for kernmulRSymm2Dz CUDA kernel, asynchronous. 
-func k_kernmulRSymm2Dz_async(fftMz unsafe.Pointer, fftKzz unsafe.Pointer, Nx int, Ny int, cfg *config) { - if Synchronous { // debug +func k_kernmulRSymm2Dz_async ( fftMz unsafe.Pointer, fftKzz unsafe.Pointer, Nx int, Ny int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("kernmulRSymm2Dz") } @@ -46,40 +46,41 @@ func k_kernmulRSymm2Dz_async(fftMz unsafe.Pointer, fftKzz unsafe.Pointer, Nx int kernmulRSymm2Dz_args.Lock() defer kernmulRSymm2Dz_args.Unlock() - if kernmulRSymm2Dz_code == 0 { + if kernmulRSymm2Dz_code == 0{ kernmulRSymm2Dz_code = fatbinLoad(kernmulRSymm2Dz_map, "kernmulRSymm2Dz") } - kernmulRSymm2Dz_args.arg_fftMz = fftMz - kernmulRSymm2Dz_args.arg_fftKzz = fftKzz - kernmulRSymm2Dz_args.arg_Nx = Nx - kernmulRSymm2Dz_args.arg_Ny = Ny + kernmulRSymm2Dz_args.arg_fftMz = fftMz + kernmulRSymm2Dz_args.arg_fftKzz = fftKzz + kernmulRSymm2Dz_args.arg_Nx = Nx + kernmulRSymm2Dz_args.arg_Ny = Ny + args := kernmulRSymm2Dz_args.argptr[:] cu.LaunchKernel(kernmulRSymm2Dz_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("kernmulRSymm2Dz") } } // maps compute capability on PTX code for kernmulRSymm2Dz kernel. 
-var kernmulRSymm2Dz_map = map[int]string{0: "", - 30: kernmulRSymm2Dz_ptx_30, - 35: kernmulRSymm2Dz_ptx_35, - 37: kernmulRSymm2Dz_ptx_37, - 50: kernmulRSymm2Dz_ptx_50, - 52: kernmulRSymm2Dz_ptx_52, - 53: kernmulRSymm2Dz_ptx_53, - 60: kernmulRSymm2Dz_ptx_60, - 61: kernmulRSymm2Dz_ptx_61, - 70: kernmulRSymm2Dz_ptx_70, - 75: kernmulRSymm2Dz_ptx_75} +var kernmulRSymm2Dz_map = map[int]string{ 0: "" , +30: kernmulRSymm2Dz_ptx_30 , +35: kernmulRSymm2Dz_ptx_35 , +37: kernmulRSymm2Dz_ptx_37 , +50: kernmulRSymm2Dz_ptx_50 , +52: kernmulRSymm2Dz_ptx_52 , +53: kernmulRSymm2Dz_ptx_53 , +60: kernmulRSymm2Dz_ptx_60 , +61: kernmulRSymm2Dz_ptx_61 , +70: kernmulRSymm2Dz_ptx_70 , +75: kernmulRSymm2Dz_ptx_75 } // kernmulRSymm2Dz PTX code for various compute capabilities. -const ( - kernmulRSymm2Dz_ptx_30 = ` +const( + kernmulRSymm2Dz_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -145,7 +146,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_35 = ` + kernmulRSymm2Dz_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -211,7 +212,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_37 = ` + kernmulRSymm2Dz_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -277,7 +278,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_50 = ` + kernmulRSymm2Dz_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -343,7 +344,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_52 = ` + kernmulRSymm2Dz_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -409,7 +410,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_53 = ` + kernmulRSymm2Dz_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -475,7 +476,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_60 = ` + kernmulRSymm2Dz_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -541,7 +542,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_61 = ` + kernmulRSymm2Dz_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -607,7 +608,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_70 = ` + kernmulRSymm2Dz_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -673,7 +674,7 @@ BB0_2: ` - kernmulRSymm2Dz_ptx_75 = ` + 
kernmulRSymm2Dz_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -739,4 +740,4 @@ BB0_2: ` -) + ) diff --git a/cuda/kernmulrsymm3d_wrapper.go b/cuda/kernmulrsymm3d_wrapper.go index e39f66683..d63f6f916 100644 --- a/cuda/kernmulrsymm3d_wrapper.go +++ b/cuda/kernmulrsymm3d_wrapper.go @@ -5,56 +5,56 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for kernmulRSymm3D kernel var kernmulRSymm3D_code cu.Function // Stores the arguments for kernmulRSymm3D kernel invocation -type kernmulRSymm3D_args_t struct { - arg_fftMx unsafe.Pointer - arg_fftMy unsafe.Pointer - arg_fftMz unsafe.Pointer - arg_fftKxx unsafe.Pointer - arg_fftKyy unsafe.Pointer - arg_fftKzz unsafe.Pointer - arg_fftKyz unsafe.Pointer - arg_fftKxz unsafe.Pointer - arg_fftKxy unsafe.Pointer - arg_Nx int - arg_Ny int - arg_Nz int - argptr [12]unsafe.Pointer +type kernmulRSymm3D_args_t struct{ + arg_fftMx unsafe.Pointer + arg_fftMy unsafe.Pointer + arg_fftMz unsafe.Pointer + arg_fftKxx unsafe.Pointer + arg_fftKyy unsafe.Pointer + arg_fftKzz unsafe.Pointer + arg_fftKyz unsafe.Pointer + arg_fftKxz unsafe.Pointer + arg_fftKxy unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + argptr [12]unsafe.Pointer sync.Mutex } // Stores the arguments for kernmulRSymm3D kernel invocation var kernmulRSymm3D_args kernmulRSymm3D_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- kernmulRSymm3D_args.argptr[0] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMx) - kernmulRSymm3D_args.argptr[1] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMy) - kernmulRSymm3D_args.argptr[2] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMz) - kernmulRSymm3D_args.argptr[3] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxx) - kernmulRSymm3D_args.argptr[4] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKyy) - kernmulRSymm3D_args.argptr[5] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKzz) - kernmulRSymm3D_args.argptr[6] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKyz) - kernmulRSymm3D_args.argptr[7] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxz) - kernmulRSymm3D_args.argptr[8] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxy) - kernmulRSymm3D_args.argptr[9] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Nx) - kernmulRSymm3D_args.argptr[10] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Ny) - kernmulRSymm3D_args.argptr[11] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Nz) -} + kernmulRSymm3D_args.argptr[0] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMx) + kernmulRSymm3D_args.argptr[1] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMy) + kernmulRSymm3D_args.argptr[2] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMz) + kernmulRSymm3D_args.argptr[3] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxx) + kernmulRSymm3D_args.argptr[4] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKyy) + kernmulRSymm3D_args.argptr[5] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKzz) + kernmulRSymm3D_args.argptr[6] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKyz) + kernmulRSymm3D_args.argptr[7] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxz) + kernmulRSymm3D_args.argptr[8] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxy) + kernmulRSymm3D_args.argptr[9] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Nx) + kernmulRSymm3D_args.argptr[10] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Ny) + kernmulRSymm3D_args.argptr[11] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Nz) + } // Wrapper for kernmulRSymm3D CUDA kernel, 
asynchronous. -func k_kernmulRSymm3D_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, fftKxz unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { - if Synchronous { // debug +func k_kernmulRSymm3D_async ( fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, fftKxz unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("kernmulRSymm3D") } @@ -62,48 +62,49 @@ func k_kernmulRSymm3D_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz un kernmulRSymm3D_args.Lock() defer kernmulRSymm3D_args.Unlock() - if kernmulRSymm3D_code == 0 { + if kernmulRSymm3D_code == 0{ kernmulRSymm3D_code = fatbinLoad(kernmulRSymm3D_map, "kernmulRSymm3D") } - kernmulRSymm3D_args.arg_fftMx = fftMx - kernmulRSymm3D_args.arg_fftMy = fftMy - kernmulRSymm3D_args.arg_fftMz = fftMz - kernmulRSymm3D_args.arg_fftKxx = fftKxx - kernmulRSymm3D_args.arg_fftKyy = fftKyy - kernmulRSymm3D_args.arg_fftKzz = fftKzz - kernmulRSymm3D_args.arg_fftKyz = fftKyz - kernmulRSymm3D_args.arg_fftKxz = fftKxz - kernmulRSymm3D_args.arg_fftKxy = fftKxy - kernmulRSymm3D_args.arg_Nx = Nx - kernmulRSymm3D_args.arg_Ny = Ny - kernmulRSymm3D_args.arg_Nz = Nz + kernmulRSymm3D_args.arg_fftMx = fftMx + kernmulRSymm3D_args.arg_fftMy = fftMy + kernmulRSymm3D_args.arg_fftMz = fftMz + kernmulRSymm3D_args.arg_fftKxx = fftKxx + kernmulRSymm3D_args.arg_fftKyy = fftKyy + kernmulRSymm3D_args.arg_fftKzz = fftKzz + kernmulRSymm3D_args.arg_fftKyz = fftKyz + kernmulRSymm3D_args.arg_fftKxz = fftKxz + kernmulRSymm3D_args.arg_fftKxy = fftKxy + kernmulRSymm3D_args.arg_Nx = Nx + kernmulRSymm3D_args.arg_Ny = Ny + kernmulRSymm3D_args.arg_Nz = Nz + args := kernmulRSymm3D_args.argptr[:] cu.LaunchKernel(kernmulRSymm3D_code, cfg.Grid.X, 
cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("kernmulRSymm3D") } } // maps compute capability on PTX code for kernmulRSymm3D kernel. -var kernmulRSymm3D_map = map[int]string{0: "", - 30: kernmulRSymm3D_ptx_30, - 35: kernmulRSymm3D_ptx_35, - 37: kernmulRSymm3D_ptx_37, - 50: kernmulRSymm3D_ptx_50, - 52: kernmulRSymm3D_ptx_52, - 53: kernmulRSymm3D_ptx_53, - 60: kernmulRSymm3D_ptx_60, - 61: kernmulRSymm3D_ptx_61, - 70: kernmulRSymm3D_ptx_70, - 75: kernmulRSymm3D_ptx_75} +var kernmulRSymm3D_map = map[int]string{ 0: "" , +30: kernmulRSymm3D_ptx_30 , +35: kernmulRSymm3D_ptx_35 , +37: kernmulRSymm3D_ptx_37 , +50: kernmulRSymm3D_ptx_50 , +52: kernmulRSymm3D_ptx_52 , +53: kernmulRSymm3D_ptx_53 , +60: kernmulRSymm3D_ptx_60 , +61: kernmulRSymm3D_ptx_61 , +70: kernmulRSymm3D_ptx_70 , +75: kernmulRSymm3D_ptx_75 } // kernmulRSymm3D PTX code for various compute capabilities. -const ( - kernmulRSymm3D_ptx_30 = ` +const( + kernmulRSymm3D_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -250,7 +251,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_35 = ` + kernmulRSymm3D_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -397,7 +398,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_37 = ` + kernmulRSymm3D_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -544,7 +545,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_50 = ` + kernmulRSymm3D_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -691,7 +692,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_52 = ` + kernmulRSymm3D_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -838,7 +839,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_53 = ` + kernmulRSymm3D_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -985,7 +986,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_60 = ` + kernmulRSymm3D_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -1132,7 +1133,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_61 = ` + kernmulRSymm3D_ptx_61 = ` .version 6.3 .target sm_61 .address_size 
64 @@ -1279,7 +1280,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_70 = ` + kernmulRSymm3D_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1426,7 +1427,7 @@ BB0_2: ` - kernmulRSymm3D_ptx_75 = ` + kernmulRSymm3D_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1573,4 +1574,4 @@ BB0_2: ` -) + ) diff --git a/cuda/llnoprecess_wrapper.go b/cuda/llnoprecess_wrapper.go index 786d44bd7..915fd5ffb 100644 --- a/cuda/llnoprecess_wrapper.go +++ b/cuda/llnoprecess_wrapper.go @@ -5,52 +5,52 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for llnoprecess kernel var llnoprecess_code cu.Function // Stores the arguments for llnoprecess kernel invocation -type llnoprecess_args_t struct { - arg_tx unsafe.Pointer - arg_ty unsafe.Pointer - arg_tz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_hx unsafe.Pointer - arg_hy unsafe.Pointer - arg_hz unsafe.Pointer - arg_N int - argptr [10]unsafe.Pointer +type llnoprecess_args_t struct{ + arg_tx unsafe.Pointer + arg_ty unsafe.Pointer + arg_tz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_hx unsafe.Pointer + arg_hy unsafe.Pointer + arg_hz unsafe.Pointer + arg_N int + argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for llnoprecess kernel invocation var llnoprecess_args llnoprecess_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- llnoprecess_args.argptr[0] = unsafe.Pointer(&llnoprecess_args.arg_tx) - llnoprecess_args.argptr[1] = unsafe.Pointer(&llnoprecess_args.arg_ty) - llnoprecess_args.argptr[2] = unsafe.Pointer(&llnoprecess_args.arg_tz) - llnoprecess_args.argptr[3] = unsafe.Pointer(&llnoprecess_args.arg_mx) - llnoprecess_args.argptr[4] = unsafe.Pointer(&llnoprecess_args.arg_my) - llnoprecess_args.argptr[5] = unsafe.Pointer(&llnoprecess_args.arg_mz) - llnoprecess_args.argptr[6] = unsafe.Pointer(&llnoprecess_args.arg_hx) - llnoprecess_args.argptr[7] = unsafe.Pointer(&llnoprecess_args.arg_hy) - llnoprecess_args.argptr[8] = unsafe.Pointer(&llnoprecess_args.arg_hz) - llnoprecess_args.argptr[9] = unsafe.Pointer(&llnoprecess_args.arg_N) -} + llnoprecess_args.argptr[0] = unsafe.Pointer(&llnoprecess_args.arg_tx) + llnoprecess_args.argptr[1] = unsafe.Pointer(&llnoprecess_args.arg_ty) + llnoprecess_args.argptr[2] = unsafe.Pointer(&llnoprecess_args.arg_tz) + llnoprecess_args.argptr[3] = unsafe.Pointer(&llnoprecess_args.arg_mx) + llnoprecess_args.argptr[4] = unsafe.Pointer(&llnoprecess_args.arg_my) + llnoprecess_args.argptr[5] = unsafe.Pointer(&llnoprecess_args.arg_mz) + llnoprecess_args.argptr[6] = unsafe.Pointer(&llnoprecess_args.arg_hx) + llnoprecess_args.argptr[7] = unsafe.Pointer(&llnoprecess_args.arg_hy) + llnoprecess_args.argptr[8] = unsafe.Pointer(&llnoprecess_args.arg_hz) + llnoprecess_args.argptr[9] = unsafe.Pointer(&llnoprecess_args.arg_N) + } // Wrapper for llnoprecess CUDA kernel, asynchronous. 
-func k_llnoprecess_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_llnoprecess_async ( tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("llnoprecess") } @@ -58,46 +58,47 @@ func k_llnoprecess_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer llnoprecess_args.Lock() defer llnoprecess_args.Unlock() - if llnoprecess_code == 0 { + if llnoprecess_code == 0{ llnoprecess_code = fatbinLoad(llnoprecess_map, "llnoprecess") } - llnoprecess_args.arg_tx = tx - llnoprecess_args.arg_ty = ty - llnoprecess_args.arg_tz = tz - llnoprecess_args.arg_mx = mx - llnoprecess_args.arg_my = my - llnoprecess_args.arg_mz = mz - llnoprecess_args.arg_hx = hx - llnoprecess_args.arg_hy = hy - llnoprecess_args.arg_hz = hz - llnoprecess_args.arg_N = N + llnoprecess_args.arg_tx = tx + llnoprecess_args.arg_ty = ty + llnoprecess_args.arg_tz = tz + llnoprecess_args.arg_mx = mx + llnoprecess_args.arg_my = my + llnoprecess_args.arg_mz = mz + llnoprecess_args.arg_hx = hx + llnoprecess_args.arg_hy = hy + llnoprecess_args.arg_hz = hz + llnoprecess_args.arg_N = N + args := llnoprecess_args.argptr[:] cu.LaunchKernel(llnoprecess_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("llnoprecess") } } // maps compute capability on PTX code for llnoprecess kernel. 
-var llnoprecess_map = map[int]string{0: "", - 30: llnoprecess_ptx_30, - 35: llnoprecess_ptx_35, - 37: llnoprecess_ptx_37, - 50: llnoprecess_ptx_50, - 52: llnoprecess_ptx_52, - 53: llnoprecess_ptx_53, - 60: llnoprecess_ptx_60, - 61: llnoprecess_ptx_61, - 70: llnoprecess_ptx_70, - 75: llnoprecess_ptx_75} +var llnoprecess_map = map[int]string{ 0: "" , +30: llnoprecess_ptx_30 , +35: llnoprecess_ptx_35 , +37: llnoprecess_ptx_37 , +50: llnoprecess_ptx_50 , +52: llnoprecess_ptx_52 , +53: llnoprecess_ptx_53 , +60: llnoprecess_ptx_60 , +61: llnoprecess_ptx_61 , +70: llnoprecess_ptx_70 , +75: llnoprecess_ptx_75 } // llnoprecess PTX code for various compute capabilities. -const ( - llnoprecess_ptx_30 = ` +const( + llnoprecess_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -199,7 +200,7 @@ BB0_2: ` - llnoprecess_ptx_35 = ` + llnoprecess_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -301,7 +302,7 @@ BB0_2: ` - llnoprecess_ptx_37 = ` + llnoprecess_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -403,7 +404,7 @@ BB0_2: ` - llnoprecess_ptx_50 = ` + llnoprecess_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -505,7 +506,7 @@ BB0_2: ` - llnoprecess_ptx_52 = ` + llnoprecess_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -607,7 +608,7 @@ BB0_2: ` - llnoprecess_ptx_53 = ` + llnoprecess_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -709,7 +710,7 @@ BB0_2: ` - llnoprecess_ptx_60 = ` + llnoprecess_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -811,7 +812,7 @@ BB0_2: ` - llnoprecess_ptx_61 = ` + llnoprecess_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -913,7 +914,7 @@ BB0_2: ` - llnoprecess_ptx_70 = ` + llnoprecess_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1015,7 +1016,7 @@ BB0_2: ` - llnoprecess_ptx_75 = ` + llnoprecess_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1117,4 +1118,4 @@ BB0_2: ` -) + ) diff --git a/cuda/lltorque2_wrapper.go b/cuda/lltorque2_wrapper.go 
index 099482417..a88fc1738 100644 --- a/cuda/lltorque2_wrapper.go +++ b/cuda/lltorque2_wrapper.go @@ -5,56 +5,56 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for lltorque2 kernel var lltorque2_code cu.Function // Stores the arguments for lltorque2 kernel invocation -type lltorque2_args_t struct { - arg_tx unsafe.Pointer - arg_ty unsafe.Pointer - arg_tz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_hx unsafe.Pointer - arg_hy unsafe.Pointer - arg_hz unsafe.Pointer - arg_alpha_ unsafe.Pointer - arg_alpha_mul float32 - arg_N int - argptr [12]unsafe.Pointer +type lltorque2_args_t struct{ + arg_tx unsafe.Pointer + arg_ty unsafe.Pointer + arg_tz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_hx unsafe.Pointer + arg_hy unsafe.Pointer + arg_hz unsafe.Pointer + arg_alpha_ unsafe.Pointer + arg_alpha_mul float32 + arg_N int + argptr [12]unsafe.Pointer sync.Mutex } // Stores the arguments for lltorque2 kernel invocation var lltorque2_args lltorque2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- lltorque2_args.argptr[0] = unsafe.Pointer(&lltorque2_args.arg_tx) - lltorque2_args.argptr[1] = unsafe.Pointer(&lltorque2_args.arg_ty) - lltorque2_args.argptr[2] = unsafe.Pointer(&lltorque2_args.arg_tz) - lltorque2_args.argptr[3] = unsafe.Pointer(&lltorque2_args.arg_mx) - lltorque2_args.argptr[4] = unsafe.Pointer(&lltorque2_args.arg_my) - lltorque2_args.argptr[5] = unsafe.Pointer(&lltorque2_args.arg_mz) - lltorque2_args.argptr[6] = unsafe.Pointer(&lltorque2_args.arg_hx) - lltorque2_args.argptr[7] = unsafe.Pointer(&lltorque2_args.arg_hy) - lltorque2_args.argptr[8] = unsafe.Pointer(&lltorque2_args.arg_hz) - lltorque2_args.argptr[9] = unsafe.Pointer(&lltorque2_args.arg_alpha_) - lltorque2_args.argptr[10] = unsafe.Pointer(&lltorque2_args.arg_alpha_mul) - lltorque2_args.argptr[11] = unsafe.Pointer(&lltorque2_args.arg_N) -} + lltorque2_args.argptr[0] = unsafe.Pointer(&lltorque2_args.arg_tx) + lltorque2_args.argptr[1] = unsafe.Pointer(&lltorque2_args.arg_ty) + lltorque2_args.argptr[2] = unsafe.Pointer(&lltorque2_args.arg_tz) + lltorque2_args.argptr[3] = unsafe.Pointer(&lltorque2_args.arg_mx) + lltorque2_args.argptr[4] = unsafe.Pointer(&lltorque2_args.arg_my) + lltorque2_args.argptr[5] = unsafe.Pointer(&lltorque2_args.arg_mz) + lltorque2_args.argptr[6] = unsafe.Pointer(&lltorque2_args.arg_hx) + lltorque2_args.argptr[7] = unsafe.Pointer(&lltorque2_args.arg_hy) + lltorque2_args.argptr[8] = unsafe.Pointer(&lltorque2_args.arg_hz) + lltorque2_args.argptr[9] = unsafe.Pointer(&lltorque2_args.arg_alpha_) + lltorque2_args.argptr[10] = unsafe.Pointer(&lltorque2_args.arg_alpha_mul) + lltorque2_args.argptr[11] = unsafe.Pointer(&lltorque2_args.arg_N) + } // Wrapper for lltorque2 CUDA kernel, asynchronous. 
-func k_lltorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { - if Synchronous { // debug +func k_lltorque2_async ( tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("lltorque2") } @@ -62,48 +62,49 @@ func k_lltorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, lltorque2_args.Lock() defer lltorque2_args.Unlock() - if lltorque2_code == 0 { + if lltorque2_code == 0{ lltorque2_code = fatbinLoad(lltorque2_map, "lltorque2") } - lltorque2_args.arg_tx = tx - lltorque2_args.arg_ty = ty - lltorque2_args.arg_tz = tz - lltorque2_args.arg_mx = mx - lltorque2_args.arg_my = my - lltorque2_args.arg_mz = mz - lltorque2_args.arg_hx = hx - lltorque2_args.arg_hy = hy - lltorque2_args.arg_hz = hz - lltorque2_args.arg_alpha_ = alpha_ - lltorque2_args.arg_alpha_mul = alpha_mul - lltorque2_args.arg_N = N + lltorque2_args.arg_tx = tx + lltorque2_args.arg_ty = ty + lltorque2_args.arg_tz = tz + lltorque2_args.arg_mx = mx + lltorque2_args.arg_my = my + lltorque2_args.arg_mz = mz + lltorque2_args.arg_hx = hx + lltorque2_args.arg_hy = hy + lltorque2_args.arg_hz = hz + lltorque2_args.arg_alpha_ = alpha_ + lltorque2_args.arg_alpha_mul = alpha_mul + lltorque2_args.arg_N = N + args := lltorque2_args.argptr[:] cu.LaunchKernel(lltorque2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("lltorque2") } } // maps compute capability on PTX code for lltorque2 kernel. 
-var lltorque2_map = map[int]string{0: "", - 30: lltorque2_ptx_30, - 35: lltorque2_ptx_35, - 37: lltorque2_ptx_37, - 50: lltorque2_ptx_50, - 52: lltorque2_ptx_52, - 53: lltorque2_ptx_53, - 60: lltorque2_ptx_60, - 61: lltorque2_ptx_61, - 70: lltorque2_ptx_70, - 75: lltorque2_ptx_75} +var lltorque2_map = map[int]string{ 0: "" , +30: lltorque2_ptx_30 , +35: lltorque2_ptx_35 , +37: lltorque2_ptx_37 , +50: lltorque2_ptx_50 , +52: lltorque2_ptx_52 , +53: lltorque2_ptx_53 , +60: lltorque2_ptx_60 , +61: lltorque2_ptx_61 , +70: lltorque2_ptx_70 , +75: lltorque2_ptx_75 } // lltorque2 PTX code for various compute capabilities. -const ( - lltorque2_ptx_30 = ` +const( + lltorque2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -224,7 +225,7 @@ BB0_4: ` - lltorque2_ptx_35 = ` + lltorque2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -345,7 +346,7 @@ BB0_4: ` - lltorque2_ptx_37 = ` + lltorque2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -466,7 +467,7 @@ BB0_4: ` - lltorque2_ptx_50 = ` + lltorque2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -587,7 +588,7 @@ BB0_4: ` - lltorque2_ptx_52 = ` + lltorque2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -708,7 +709,7 @@ BB0_4: ` - lltorque2_ptx_53 = ` + lltorque2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -829,7 +830,7 @@ BB0_4: ` - lltorque2_ptx_60 = ` + lltorque2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -950,7 +951,7 @@ BB0_4: ` - lltorque2_ptx_61 = ` + lltorque2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1071,7 +1072,7 @@ BB0_4: ` - lltorque2_ptx_70 = ` + lltorque2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1192,7 +1193,7 @@ BB0_4: ` - lltorque2_ptx_75 = ` + lltorque2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1313,4 +1314,4 @@ BB0_4: ` -) + ) diff --git a/cuda/madd2_wrapper.go b/cuda/madd2_wrapper.go index 3730665e8..3c684f849 100644 --- a/cuda/madd2_wrapper.go +++ b/cuda/madd2_wrapper.go @@ 
-5,44 +5,44 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for madd2 kernel var madd2_code cu.Function // Stores the arguments for madd2 kernel invocation -type madd2_args_t struct { - arg_dst unsafe.Pointer - arg_src1 unsafe.Pointer - arg_fac1 float32 - arg_src2 unsafe.Pointer - arg_fac2 float32 - arg_N int - argptr [6]unsafe.Pointer +type madd2_args_t struct{ + arg_dst unsafe.Pointer + arg_src1 unsafe.Pointer + arg_fac1 float32 + arg_src2 unsafe.Pointer + arg_fac2 float32 + arg_N int + argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for madd2 kernel invocation var madd2_args madd2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - madd2_args.argptr[0] = unsafe.Pointer(&madd2_args.arg_dst) - madd2_args.argptr[1] = unsafe.Pointer(&madd2_args.arg_src1) - madd2_args.argptr[2] = unsafe.Pointer(&madd2_args.arg_fac1) - madd2_args.argptr[3] = unsafe.Pointer(&madd2_args.arg_src2) - madd2_args.argptr[4] = unsafe.Pointer(&madd2_args.arg_fac2) - madd2_args.argptr[5] = unsafe.Pointer(&madd2_args.arg_N) -} + madd2_args.argptr[0] = unsafe.Pointer(&madd2_args.arg_dst) + madd2_args.argptr[1] = unsafe.Pointer(&madd2_args.arg_src1) + madd2_args.argptr[2] = unsafe.Pointer(&madd2_args.arg_fac1) + madd2_args.argptr[3] = unsafe.Pointer(&madd2_args.arg_src2) + madd2_args.argptr[4] = unsafe.Pointer(&madd2_args.arg_fac2) + madd2_args.argptr[5] = unsafe.Pointer(&madd2_args.arg_N) + } // Wrapper for madd2 CUDA kernel, asynchronous. 
-func k_madd2_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, N int, cfg *config) { - if Synchronous { // debug +func k_madd2_async ( dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("madd2") } @@ -50,42 +50,43 @@ func k_madd2_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 u madd2_args.Lock() defer madd2_args.Unlock() - if madd2_code == 0 { + if madd2_code == 0{ madd2_code = fatbinLoad(madd2_map, "madd2") } - madd2_args.arg_dst = dst - madd2_args.arg_src1 = src1 - madd2_args.arg_fac1 = fac1 - madd2_args.arg_src2 = src2 - madd2_args.arg_fac2 = fac2 - madd2_args.arg_N = N + madd2_args.arg_dst = dst + madd2_args.arg_src1 = src1 + madd2_args.arg_fac1 = fac1 + madd2_args.arg_src2 = src2 + madd2_args.arg_fac2 = fac2 + madd2_args.arg_N = N + args := madd2_args.argptr[:] cu.LaunchKernel(madd2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("madd2") } } // maps compute capability on PTX code for madd2 kernel. -var madd2_map = map[int]string{0: "", - 30: madd2_ptx_30, - 35: madd2_ptx_35, - 37: madd2_ptx_37, - 50: madd2_ptx_50, - 52: madd2_ptx_52, - 53: madd2_ptx_53, - 60: madd2_ptx_60, - 61: madd2_ptx_61, - 70: madd2_ptx_70, - 75: madd2_ptx_75} +var madd2_map = map[int]string{ 0: "" , +30: madd2_ptx_30 , +35: madd2_ptx_35 , +37: madd2_ptx_37 , +50: madd2_ptx_50 , +52: madd2_ptx_52 , +53: madd2_ptx_53 , +60: madd2_ptx_60 , +61: madd2_ptx_61 , +70: madd2_ptx_70 , +75: madd2_ptx_75 } // madd2 PTX code for various compute capabilities. 
-const ( - madd2_ptx_30 = ` +const( + madd2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -142,7 +143,7 @@ BB0_2: ` - madd2_ptx_35 = ` + madd2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -199,7 +200,7 @@ BB0_2: ` - madd2_ptx_37 = ` + madd2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -256,7 +257,7 @@ BB0_2: ` - madd2_ptx_50 = ` + madd2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -313,7 +314,7 @@ BB0_2: ` - madd2_ptx_52 = ` + madd2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -370,7 +371,7 @@ BB0_2: ` - madd2_ptx_53 = ` + madd2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -427,7 +428,7 @@ BB0_2: ` - madd2_ptx_60 = ` + madd2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -484,7 +485,7 @@ BB0_2: ` - madd2_ptx_61 = ` + madd2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -541,7 +542,7 @@ BB0_2: ` - madd2_ptx_70 = ` + madd2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -598,7 +599,7 @@ BB0_2: ` - madd2_ptx_75 = ` + madd2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -655,4 +656,4 @@ BB0_2: ` -) + ) diff --git a/cuda/madd3_wrapper.go b/cuda/madd3_wrapper.go index 1f9ca91a2..659ffa2d5 100644 --- a/cuda/madd3_wrapper.go +++ b/cuda/madd3_wrapper.go @@ -5,48 +5,48 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for madd3 kernel var madd3_code cu.Function // Stores the arguments for madd3 kernel invocation -type madd3_args_t struct { - arg_dst unsafe.Pointer - arg_src1 unsafe.Pointer - arg_fac1 float32 - arg_src2 unsafe.Pointer - arg_fac2 float32 - arg_src3 unsafe.Pointer - arg_fac3 float32 - arg_N int - argptr [8]unsafe.Pointer +type madd3_args_t struct{ + arg_dst unsafe.Pointer + arg_src1 unsafe.Pointer + arg_fac1 float32 + arg_src2 unsafe.Pointer + arg_fac2 float32 + arg_src3 unsafe.Pointer + arg_fac3 float32 + arg_N int + argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for madd3 kernel invocation var madd3_args madd3_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - madd3_args.argptr[0] = unsafe.Pointer(&madd3_args.arg_dst) - madd3_args.argptr[1] = unsafe.Pointer(&madd3_args.arg_src1) - madd3_args.argptr[2] = unsafe.Pointer(&madd3_args.arg_fac1) - madd3_args.argptr[3] = unsafe.Pointer(&madd3_args.arg_src2) - madd3_args.argptr[4] = unsafe.Pointer(&madd3_args.arg_fac2) - madd3_args.argptr[5] = unsafe.Pointer(&madd3_args.arg_src3) - madd3_args.argptr[6] = unsafe.Pointer(&madd3_args.arg_fac3) - madd3_args.argptr[7] = unsafe.Pointer(&madd3_args.arg_N) -} + madd3_args.argptr[0] = unsafe.Pointer(&madd3_args.arg_dst) + madd3_args.argptr[1] = unsafe.Pointer(&madd3_args.arg_src1) + madd3_args.argptr[2] = unsafe.Pointer(&madd3_args.arg_fac1) + madd3_args.argptr[3] = unsafe.Pointer(&madd3_args.arg_src2) + madd3_args.argptr[4] = unsafe.Pointer(&madd3_args.arg_fac2) + madd3_args.argptr[5] = unsafe.Pointer(&madd3_args.arg_src3) + madd3_args.argptr[6] = unsafe.Pointer(&madd3_args.arg_fac3) + madd3_args.argptr[7] = unsafe.Pointer(&madd3_args.arg_N) + } // Wrapper for madd3 CUDA kernel, asynchronous. 
-func k_madd3_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, N int, cfg *config) { - if Synchronous { // debug +func k_madd3_async ( dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("madd3") } @@ -54,44 +54,45 @@ func k_madd3_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 u madd3_args.Lock() defer madd3_args.Unlock() - if madd3_code == 0 { + if madd3_code == 0{ madd3_code = fatbinLoad(madd3_map, "madd3") } - madd3_args.arg_dst = dst - madd3_args.arg_src1 = src1 - madd3_args.arg_fac1 = fac1 - madd3_args.arg_src2 = src2 - madd3_args.arg_fac2 = fac2 - madd3_args.arg_src3 = src3 - madd3_args.arg_fac3 = fac3 - madd3_args.arg_N = N + madd3_args.arg_dst = dst + madd3_args.arg_src1 = src1 + madd3_args.arg_fac1 = fac1 + madd3_args.arg_src2 = src2 + madd3_args.arg_fac2 = fac2 + madd3_args.arg_src3 = src3 + madd3_args.arg_fac3 = fac3 + madd3_args.arg_N = N + args := madd3_args.argptr[:] cu.LaunchKernel(madd3_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("madd3") } } // maps compute capability on PTX code for madd3 kernel. -var madd3_map = map[int]string{0: "", - 30: madd3_ptx_30, - 35: madd3_ptx_35, - 37: madd3_ptx_37, - 50: madd3_ptx_50, - 52: madd3_ptx_52, - 53: madd3_ptx_53, - 60: madd3_ptx_60, - 61: madd3_ptx_61, - 70: madd3_ptx_70, - 75: madd3_ptx_75} +var madd3_map = map[int]string{ 0: "" , +30: madd3_ptx_30 , +35: madd3_ptx_35 , +37: madd3_ptx_37 , +50: madd3_ptx_50 , +52: madd3_ptx_52 , +53: madd3_ptx_53 , +60: madd3_ptx_60 , +61: madd3_ptx_61 , +70: madd3_ptx_70 , +75: madd3_ptx_75 } // madd3 PTX code for various compute capabilities. 
-const ( - madd3_ptx_30 = ` +const( + madd3_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -156,7 +157,7 @@ BB0_2: ` - madd3_ptx_35 = ` + madd3_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -221,7 +222,7 @@ BB0_2: ` - madd3_ptx_37 = ` + madd3_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -286,7 +287,7 @@ BB0_2: ` - madd3_ptx_50 = ` + madd3_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -351,7 +352,7 @@ BB0_2: ` - madd3_ptx_52 = ` + madd3_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -416,7 +417,7 @@ BB0_2: ` - madd3_ptx_53 = ` + madd3_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -481,7 +482,7 @@ BB0_2: ` - madd3_ptx_60 = ` + madd3_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -546,7 +547,7 @@ BB0_2: ` - madd3_ptx_61 = ` + madd3_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -611,7 +612,7 @@ BB0_2: ` - madd3_ptx_70 = ` + madd3_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -676,7 +677,7 @@ BB0_2: ` - madd3_ptx_75 = ` + madd3_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -741,4 +742,4 @@ BB0_2: ` -) + ) diff --git a/cuda/magnetoelasticfield_wrapper.go b/cuda/magnetoelasticfield_wrapper.go index 4686de41d..fd68a637a 100644 --- a/cuda/magnetoelasticfield_wrapper.go +++ b/cuda/magnetoelasticfield_wrapper.go @@ -5,82 +5,82 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for addmagnetoelasticfield kernel var addmagnetoelasticfield_code cu.Function // Stores the arguments for addmagnetoelasticfield kernel invocation -type addmagnetoelasticfield_args_t struct { - arg_Bx unsafe.Pointer - arg_By unsafe.Pointer - arg_Bz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_exx_ unsafe.Pointer - arg_exx_mul float32 - arg_eyy_ unsafe.Pointer - arg_eyy_mul float32 - arg_ezz_ unsafe.Pointer - arg_ezz_mul float32 - arg_exy_ unsafe.Pointer - arg_exy_mul float32 - arg_exz_ unsafe.Pointer - arg_exz_mul float32 - arg_eyz_ unsafe.Pointer - arg_eyz_mul float32 - arg_B1_ unsafe.Pointer - arg_B1_mul float32 - arg_B2_ unsafe.Pointer - arg_B2_mul float32 - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_N int - argptr [25]unsafe.Pointer +type addmagnetoelasticfield_args_t struct{ + arg_Bx unsafe.Pointer + arg_By unsafe.Pointer + arg_Bz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_exx_ unsafe.Pointer + arg_exx_mul float32 + arg_eyy_ unsafe.Pointer + arg_eyy_mul float32 + arg_ezz_ unsafe.Pointer + arg_ezz_mul float32 + arg_exy_ unsafe.Pointer + arg_exy_mul float32 + arg_exz_ unsafe.Pointer + arg_exz_mul float32 + arg_eyz_ unsafe.Pointer + arg_eyz_mul float32 + arg_B1_ unsafe.Pointer + arg_B1_mul float32 + arg_B2_ unsafe.Pointer + arg_B2_mul float32 + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_N int + argptr [25]unsafe.Pointer sync.Mutex } // Stores the arguments for addmagnetoelasticfield kernel invocation var addmagnetoelasticfield_args addmagnetoelasticfield_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- addmagnetoelasticfield_args.argptr[0] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Bx) - addmagnetoelasticfield_args.argptr[1] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_By) - addmagnetoelasticfield_args.argptr[2] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Bz) - addmagnetoelasticfield_args.argptr[3] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_mx) - addmagnetoelasticfield_args.argptr[4] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_my) - addmagnetoelasticfield_args.argptr[5] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_mz) - addmagnetoelasticfield_args.argptr[6] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exx_) - addmagnetoelasticfield_args.argptr[7] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exx_mul) - addmagnetoelasticfield_args.argptr[8] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyy_) - addmagnetoelasticfield_args.argptr[9] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyy_mul) - addmagnetoelasticfield_args.argptr[10] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_ezz_) - addmagnetoelasticfield_args.argptr[11] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_ezz_mul) - addmagnetoelasticfield_args.argptr[12] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exy_) - addmagnetoelasticfield_args.argptr[13] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exy_mul) - addmagnetoelasticfield_args.argptr[14] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exz_) - addmagnetoelasticfield_args.argptr[15] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exz_mul) - addmagnetoelasticfield_args.argptr[16] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyz_) - addmagnetoelasticfield_args.argptr[17] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyz_mul) - addmagnetoelasticfield_args.argptr[18] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B1_) - addmagnetoelasticfield_args.argptr[19] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B1_mul) - addmagnetoelasticfield_args.argptr[20] = 
unsafe.Pointer(&addmagnetoelasticfield_args.arg_B2_) - addmagnetoelasticfield_args.argptr[21] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B2_mul) - addmagnetoelasticfield_args.argptr[22] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Ms_) - addmagnetoelasticfield_args.argptr[23] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Ms_mul) - addmagnetoelasticfield_args.argptr[24] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_N) -} + addmagnetoelasticfield_args.argptr[0] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Bx) + addmagnetoelasticfield_args.argptr[1] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_By) + addmagnetoelasticfield_args.argptr[2] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Bz) + addmagnetoelasticfield_args.argptr[3] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_mx) + addmagnetoelasticfield_args.argptr[4] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_my) + addmagnetoelasticfield_args.argptr[5] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_mz) + addmagnetoelasticfield_args.argptr[6] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exx_) + addmagnetoelasticfield_args.argptr[7] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exx_mul) + addmagnetoelasticfield_args.argptr[8] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyy_) + addmagnetoelasticfield_args.argptr[9] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyy_mul) + addmagnetoelasticfield_args.argptr[10] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_ezz_) + addmagnetoelasticfield_args.argptr[11] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_ezz_mul) + addmagnetoelasticfield_args.argptr[12] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exy_) + addmagnetoelasticfield_args.argptr[13] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exy_mul) + addmagnetoelasticfield_args.argptr[14] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exz_) + addmagnetoelasticfield_args.argptr[15] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_exz_mul) + 
addmagnetoelasticfield_args.argptr[16] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyz_) + addmagnetoelasticfield_args.argptr[17] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_eyz_mul) + addmagnetoelasticfield_args.argptr[18] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B1_) + addmagnetoelasticfield_args.argptr[19] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B1_mul) + addmagnetoelasticfield_args.argptr[20] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B2_) + addmagnetoelasticfield_args.argptr[21] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_B2_mul) + addmagnetoelasticfield_args.argptr[22] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Ms_) + addmagnetoelasticfield_args.argptr[23] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_Ms_mul) + addmagnetoelasticfield_args.argptr[24] = unsafe.Pointer(&addmagnetoelasticfield_args.arg_N) + } // Wrapper for addmagnetoelasticfield CUDA kernel, asynchronous. -func k_addmagnetoelasticfield_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, exx_ unsafe.Pointer, exx_mul float32, eyy_ unsafe.Pointer, eyy_mul float32, ezz_ unsafe.Pointer, ezz_mul float32, exy_ unsafe.Pointer, exy_mul float32, exz_ unsafe.Pointer, exz_mul float32, eyz_ unsafe.Pointer, eyz_mul float32, B1_ unsafe.Pointer, B1_mul float32, B2_ unsafe.Pointer, B2_mul float32, Ms_ unsafe.Pointer, Ms_mul float32, N int, cfg *config) { - if Synchronous { // debug +func k_addmagnetoelasticfield_async ( Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, exx_ unsafe.Pointer, exx_mul float32, eyy_ unsafe.Pointer, eyy_mul float32, ezz_ unsafe.Pointer, ezz_mul float32, exy_ unsafe.Pointer, exy_mul float32, exz_ unsafe.Pointer, exz_mul float32, eyz_ unsafe.Pointer, eyz_mul float32, B1_ unsafe.Pointer, B1_mul float32, B2_ unsafe.Pointer, B2_mul float32, Ms_ unsafe.Pointer, Ms_mul float32, N int, cfg *config) { + if 
Synchronous{ // debug Sync() timer.Start("addmagnetoelasticfield") } @@ -88,61 +88,62 @@ func k_addmagnetoelasticfield_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz uns addmagnetoelasticfield_args.Lock() defer addmagnetoelasticfield_args.Unlock() - if addmagnetoelasticfield_code == 0 { + if addmagnetoelasticfield_code == 0{ addmagnetoelasticfield_code = fatbinLoad(addmagnetoelasticfield_map, "addmagnetoelasticfield") } - addmagnetoelasticfield_args.arg_Bx = Bx - addmagnetoelasticfield_args.arg_By = By - addmagnetoelasticfield_args.arg_Bz = Bz - addmagnetoelasticfield_args.arg_mx = mx - addmagnetoelasticfield_args.arg_my = my - addmagnetoelasticfield_args.arg_mz = mz - addmagnetoelasticfield_args.arg_exx_ = exx_ - addmagnetoelasticfield_args.arg_exx_mul = exx_mul - addmagnetoelasticfield_args.arg_eyy_ = eyy_ - addmagnetoelasticfield_args.arg_eyy_mul = eyy_mul - addmagnetoelasticfield_args.arg_ezz_ = ezz_ - addmagnetoelasticfield_args.arg_ezz_mul = ezz_mul - addmagnetoelasticfield_args.arg_exy_ = exy_ - addmagnetoelasticfield_args.arg_exy_mul = exy_mul - addmagnetoelasticfield_args.arg_exz_ = exz_ - addmagnetoelasticfield_args.arg_exz_mul = exz_mul - addmagnetoelasticfield_args.arg_eyz_ = eyz_ - addmagnetoelasticfield_args.arg_eyz_mul = eyz_mul - addmagnetoelasticfield_args.arg_B1_ = B1_ - addmagnetoelasticfield_args.arg_B1_mul = B1_mul - addmagnetoelasticfield_args.arg_B2_ = B2_ - addmagnetoelasticfield_args.arg_B2_mul = B2_mul - addmagnetoelasticfield_args.arg_Ms_ = Ms_ - addmagnetoelasticfield_args.arg_Ms_mul = Ms_mul - addmagnetoelasticfield_args.arg_N = N + addmagnetoelasticfield_args.arg_Bx = Bx + addmagnetoelasticfield_args.arg_By = By + addmagnetoelasticfield_args.arg_Bz = Bz + addmagnetoelasticfield_args.arg_mx = mx + addmagnetoelasticfield_args.arg_my = my + addmagnetoelasticfield_args.arg_mz = mz + addmagnetoelasticfield_args.arg_exx_ = exx_ + addmagnetoelasticfield_args.arg_exx_mul = exx_mul + addmagnetoelasticfield_args.arg_eyy_ = eyy_ + 
addmagnetoelasticfield_args.arg_eyy_mul = eyy_mul + addmagnetoelasticfield_args.arg_ezz_ = ezz_ + addmagnetoelasticfield_args.arg_ezz_mul = ezz_mul + addmagnetoelasticfield_args.arg_exy_ = exy_ + addmagnetoelasticfield_args.arg_exy_mul = exy_mul + addmagnetoelasticfield_args.arg_exz_ = exz_ + addmagnetoelasticfield_args.arg_exz_mul = exz_mul + addmagnetoelasticfield_args.arg_eyz_ = eyz_ + addmagnetoelasticfield_args.arg_eyz_mul = eyz_mul + addmagnetoelasticfield_args.arg_B1_ = B1_ + addmagnetoelasticfield_args.arg_B1_mul = B1_mul + addmagnetoelasticfield_args.arg_B2_ = B2_ + addmagnetoelasticfield_args.arg_B2_mul = B2_mul + addmagnetoelasticfield_args.arg_Ms_ = Ms_ + addmagnetoelasticfield_args.arg_Ms_mul = Ms_mul + addmagnetoelasticfield_args.arg_N = N + args := addmagnetoelasticfield_args.argptr[:] cu.LaunchKernel(addmagnetoelasticfield_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("addmagnetoelasticfield") } } // maps compute capability on PTX code for addmagnetoelasticfield kernel. 
-var addmagnetoelasticfield_map = map[int]string{0: "", - 30: addmagnetoelasticfield_ptx_30, - 35: addmagnetoelasticfield_ptx_35, - 37: addmagnetoelasticfield_ptx_37, - 50: addmagnetoelasticfield_ptx_50, - 52: addmagnetoelasticfield_ptx_52, - 53: addmagnetoelasticfield_ptx_53, - 60: addmagnetoelasticfield_ptx_60, - 61: addmagnetoelasticfield_ptx_61, - 70: addmagnetoelasticfield_ptx_70, - 75: addmagnetoelasticfield_ptx_75} +var addmagnetoelasticfield_map = map[int]string{ 0: "" , +30: addmagnetoelasticfield_ptx_30 , +35: addmagnetoelasticfield_ptx_35 , +37: addmagnetoelasticfield_ptx_37 , +50: addmagnetoelasticfield_ptx_50 , +52: addmagnetoelasticfield_ptx_52 , +53: addmagnetoelasticfield_ptx_53 , +60: addmagnetoelasticfield_ptx_60 , +61: addmagnetoelasticfield_ptx_61 , +70: addmagnetoelasticfield_ptx_70 , +75: addmagnetoelasticfield_ptx_75 } // addmagnetoelasticfield PTX code for various compute capabilities. -const ( - addmagnetoelasticfield_ptx_30 = ` +const( + addmagnetoelasticfield_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -365,7 +366,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_35 = ` + addmagnetoelasticfield_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -588,7 +589,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_37 = ` + addmagnetoelasticfield_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -811,7 +812,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_50 = ` + addmagnetoelasticfield_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -1034,7 +1035,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_52 = ` + addmagnetoelasticfield_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -1257,7 +1258,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_53 = ` + addmagnetoelasticfield_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -1480,7 +1481,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_60 = ` + addmagnetoelasticfield_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -1703,7 +1704,7 @@ BB0_22: ` - 
addmagnetoelasticfield_ptx_61 = ` + addmagnetoelasticfield_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1926,7 +1927,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_70 = ` + addmagnetoelasticfield_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -2149,7 +2150,7 @@ BB0_22: ` - addmagnetoelasticfield_ptx_75 = ` + addmagnetoelasticfield_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -2372,4 +2373,4 @@ BB0_22: ` -) + ) diff --git a/cuda/magnetoelasticforce_wrapper.go b/cuda/magnetoelasticforce_wrapper.go index fbce48b57..f340117fb 100644 --- a/cuda/magnetoelasticforce_wrapper.go +++ b/cuda/magnetoelasticforce_wrapper.go @@ -5,66 +5,66 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for getmagnetoelasticforce kernel var getmagnetoelasticforce_code cu.Function // Stores the arguments for getmagnetoelasticforce kernel invocation -type getmagnetoelasticforce_args_t struct { - arg_fx unsafe.Pointer - arg_fy unsafe.Pointer - arg_fz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_B1_ unsafe.Pointer - arg_B1_mul float32 - arg_B2_ unsafe.Pointer - arg_B2_mul float32 - arg_rcsx float32 - arg_rcsy float32 - arg_rcsz float32 - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - argptr [17]unsafe.Pointer +type getmagnetoelasticforce_args_t struct{ + arg_fx unsafe.Pointer + arg_fy unsafe.Pointer + arg_fz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_B1_ unsafe.Pointer + arg_B1_mul float32 + arg_B2_ unsafe.Pointer + arg_B2_mul float32 + arg_rcsx float32 + arg_rcsy float32 + arg_rcsz float32 + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + argptr [17]unsafe.Pointer sync.Mutex } // Stores the arguments for getmagnetoelasticforce kernel invocation var getmagnetoelasticforce_args getmagnetoelasticforce_args_t -func init() { +func init(){ // CUDA 
driver kernel call wants pointers to arguments, set them up once. - getmagnetoelasticforce_args.argptr[0] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fx) - getmagnetoelasticforce_args.argptr[1] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fy) - getmagnetoelasticforce_args.argptr[2] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fz) - getmagnetoelasticforce_args.argptr[3] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_mx) - getmagnetoelasticforce_args.argptr[4] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_my) - getmagnetoelasticforce_args.argptr[5] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_mz) - getmagnetoelasticforce_args.argptr[6] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B1_) - getmagnetoelasticforce_args.argptr[7] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B1_mul) - getmagnetoelasticforce_args.argptr[8] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B2_) - getmagnetoelasticforce_args.argptr[9] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B2_mul) - getmagnetoelasticforce_args.argptr[10] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsx) - getmagnetoelasticforce_args.argptr[11] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsy) - getmagnetoelasticforce_args.argptr[12] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsz) - getmagnetoelasticforce_args.argptr[13] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Nx) - getmagnetoelasticforce_args.argptr[14] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Ny) - getmagnetoelasticforce_args.argptr[15] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Nz) - getmagnetoelasticforce_args.argptr[16] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_PBC) -} + getmagnetoelasticforce_args.argptr[0] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fx) + getmagnetoelasticforce_args.argptr[1] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fy) + getmagnetoelasticforce_args.argptr[2] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fz) + 
getmagnetoelasticforce_args.argptr[3] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_mx) + getmagnetoelasticforce_args.argptr[4] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_my) + getmagnetoelasticforce_args.argptr[5] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_mz) + getmagnetoelasticforce_args.argptr[6] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B1_) + getmagnetoelasticforce_args.argptr[7] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B1_mul) + getmagnetoelasticforce_args.argptr[8] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B2_) + getmagnetoelasticforce_args.argptr[9] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B2_mul) + getmagnetoelasticforce_args.argptr[10] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsx) + getmagnetoelasticforce_args.argptr[11] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsy) + getmagnetoelasticforce_args.argptr[12] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsz) + getmagnetoelasticforce_args.argptr[13] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Nx) + getmagnetoelasticforce_args.argptr[14] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Ny) + getmagnetoelasticforce_args.argptr[15] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Nz) + getmagnetoelasticforce_args.argptr[16] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_PBC) + } // Wrapper for getmagnetoelasticforce CUDA kernel, asynchronous. 
-func k_getmagnetoelasticforce_async(fx unsafe.Pointer, fy unsafe.Pointer, fz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, B1_ unsafe.Pointer, B1_mul float32, B2_ unsafe.Pointer, B2_mul float32, rcsx float32, rcsy float32, rcsz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { - if Synchronous { // debug +func k_getmagnetoelasticforce_async ( fx unsafe.Pointer, fy unsafe.Pointer, fz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, B1_ unsafe.Pointer, B1_mul float32, B2_ unsafe.Pointer, B2_mul float32, rcsx float32, rcsy float32, rcsz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("getmagnetoelasticforce") } @@ -72,53 +72,54 @@ func k_getmagnetoelasticforce_async(fx unsafe.Pointer, fy unsafe.Pointer, fz uns getmagnetoelasticforce_args.Lock() defer getmagnetoelasticforce_args.Unlock() - if getmagnetoelasticforce_code == 0 { + if getmagnetoelasticforce_code == 0{ getmagnetoelasticforce_code = fatbinLoad(getmagnetoelasticforce_map, "getmagnetoelasticforce") } - getmagnetoelasticforce_args.arg_fx = fx - getmagnetoelasticforce_args.arg_fy = fy - getmagnetoelasticforce_args.arg_fz = fz - getmagnetoelasticforce_args.arg_mx = mx - getmagnetoelasticforce_args.arg_my = my - getmagnetoelasticforce_args.arg_mz = mz - getmagnetoelasticforce_args.arg_B1_ = B1_ - getmagnetoelasticforce_args.arg_B1_mul = B1_mul - getmagnetoelasticforce_args.arg_B2_ = B2_ - getmagnetoelasticforce_args.arg_B2_mul = B2_mul - getmagnetoelasticforce_args.arg_rcsx = rcsx - getmagnetoelasticforce_args.arg_rcsy = rcsy - getmagnetoelasticforce_args.arg_rcsz = rcsz - getmagnetoelasticforce_args.arg_Nx = Nx - getmagnetoelasticforce_args.arg_Ny = Ny - getmagnetoelasticforce_args.arg_Nz = Nz - getmagnetoelasticforce_args.arg_PBC = PBC + getmagnetoelasticforce_args.arg_fx = fx + getmagnetoelasticforce_args.arg_fy = fy + getmagnetoelasticforce_args.arg_fz = fz + getmagnetoelasticforce_args.arg_mx = 
mx + getmagnetoelasticforce_args.arg_my = my + getmagnetoelasticforce_args.arg_mz = mz + getmagnetoelasticforce_args.arg_B1_ = B1_ + getmagnetoelasticforce_args.arg_B1_mul = B1_mul + getmagnetoelasticforce_args.arg_B2_ = B2_ + getmagnetoelasticforce_args.arg_B2_mul = B2_mul + getmagnetoelasticforce_args.arg_rcsx = rcsx + getmagnetoelasticforce_args.arg_rcsy = rcsy + getmagnetoelasticforce_args.arg_rcsz = rcsz + getmagnetoelasticforce_args.arg_Nx = Nx + getmagnetoelasticforce_args.arg_Ny = Ny + getmagnetoelasticforce_args.arg_Nz = Nz + getmagnetoelasticforce_args.arg_PBC = PBC + args := getmagnetoelasticforce_args.argptr[:] cu.LaunchKernel(getmagnetoelasticforce_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("getmagnetoelasticforce") } } // maps compute capability on PTX code for getmagnetoelasticforce kernel. -var getmagnetoelasticforce_map = map[int]string{0: "", - 30: getmagnetoelasticforce_ptx_30, - 35: getmagnetoelasticforce_ptx_35, - 37: getmagnetoelasticforce_ptx_37, - 50: getmagnetoelasticforce_ptx_50, - 52: getmagnetoelasticforce_ptx_52, - 53: getmagnetoelasticforce_ptx_53, - 60: getmagnetoelasticforce_ptx_60, - 61: getmagnetoelasticforce_ptx_61, - 70: getmagnetoelasticforce_ptx_70, - 75: getmagnetoelasticforce_ptx_75} +var getmagnetoelasticforce_map = map[int]string{ 0: "" , +30: getmagnetoelasticforce_ptx_30 , +35: getmagnetoelasticforce_ptx_35 , +37: getmagnetoelasticforce_ptx_37 , +50: getmagnetoelasticforce_ptx_50 , +52: getmagnetoelasticforce_ptx_52 , +53: getmagnetoelasticforce_ptx_53 , +60: getmagnetoelasticforce_ptx_60 , +61: getmagnetoelasticforce_ptx_61 , +70: getmagnetoelasticforce_ptx_70 , +75: getmagnetoelasticforce_ptx_75 } // getmagnetoelasticforce PTX code for various compute capabilities. 
-const ( - getmagnetoelasticforce_ptx_30 = ` +const( + getmagnetoelasticforce_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -1034,7 +1035,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_35 = ` + getmagnetoelasticforce_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -1941,7 +1942,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_37 = ` + getmagnetoelasticforce_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -2848,7 +2849,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_50 = ` + getmagnetoelasticforce_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -3755,7 +3756,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_52 = ` + getmagnetoelasticforce_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -4662,7 +4663,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_53 = ` + getmagnetoelasticforce_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -5569,7 +5570,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_60 = ` + getmagnetoelasticforce_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -6476,7 +6477,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_61 = ` + getmagnetoelasticforce_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -7383,7 +7384,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_70 = ` + getmagnetoelasticforce_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -8290,7 +8291,7 @@ BB0_108: ` - getmagnetoelasticforce_ptx_75 = ` + getmagnetoelasticforce_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -9197,4 +9198,4 @@ BB0_108: ` -) + ) diff --git a/cuda/maxangle_wrapper.go b/cuda/maxangle_wrapper.go index a0b11a337..231de47f0 100644 --- a/cuda/maxangle_wrapper.go +++ b/cuda/maxangle_wrapper.go @@ -5,52 +5,52 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for setmaxangle kernel var setmaxangle_code cu.Function // Stores the arguments for setmaxangle kernel invocation -type setmaxangle_args_t struct { - arg_dst unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_aLUT2d unsafe.Pointer - arg_regions unsafe.Pointer - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - argptr [10]unsafe.Pointer +type setmaxangle_args_t struct{ + arg_dst unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_aLUT2d unsafe.Pointer + arg_regions unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for setmaxangle kernel invocation var setmaxangle_args setmaxangle_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - setmaxangle_args.argptr[0] = unsafe.Pointer(&setmaxangle_args.arg_dst) - setmaxangle_args.argptr[1] = unsafe.Pointer(&setmaxangle_args.arg_mx) - setmaxangle_args.argptr[2] = unsafe.Pointer(&setmaxangle_args.arg_my) - setmaxangle_args.argptr[3] = unsafe.Pointer(&setmaxangle_args.arg_mz) - setmaxangle_args.argptr[4] = unsafe.Pointer(&setmaxangle_args.arg_aLUT2d) - setmaxangle_args.argptr[5] = unsafe.Pointer(&setmaxangle_args.arg_regions) - setmaxangle_args.argptr[6] = unsafe.Pointer(&setmaxangle_args.arg_Nx) - setmaxangle_args.argptr[7] = unsafe.Pointer(&setmaxangle_args.arg_Ny) - setmaxangle_args.argptr[8] = unsafe.Pointer(&setmaxangle_args.arg_Nz) - setmaxangle_args.argptr[9] = unsafe.Pointer(&setmaxangle_args.arg_PBC) -} + setmaxangle_args.argptr[0] = unsafe.Pointer(&setmaxangle_args.arg_dst) + setmaxangle_args.argptr[1] = unsafe.Pointer(&setmaxangle_args.arg_mx) + setmaxangle_args.argptr[2] = unsafe.Pointer(&setmaxangle_args.arg_my) + setmaxangle_args.argptr[3] = 
unsafe.Pointer(&setmaxangle_args.arg_mz) + setmaxangle_args.argptr[4] = unsafe.Pointer(&setmaxangle_args.arg_aLUT2d) + setmaxangle_args.argptr[5] = unsafe.Pointer(&setmaxangle_args.arg_regions) + setmaxangle_args.argptr[6] = unsafe.Pointer(&setmaxangle_args.arg_Nx) + setmaxangle_args.argptr[7] = unsafe.Pointer(&setmaxangle_args.arg_Ny) + setmaxangle_args.argptr[8] = unsafe.Pointer(&setmaxangle_args.arg_Nz) + setmaxangle_args.argptr[9] = unsafe.Pointer(&setmaxangle_args.arg_PBC) + } // Wrapper for setmaxangle CUDA kernel, asynchronous. -func k_setmaxangle_async(dst unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, Nx int, Ny int, Nz int, PBC byte, cfg *config) { - if Synchronous { // debug +func k_setmaxangle_async ( dst unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, Nx int, Ny int, Nz int, PBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("setmaxangle") } @@ -58,46 +58,47 @@ func k_setmaxangle_async(dst unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointe setmaxangle_args.Lock() defer setmaxangle_args.Unlock() - if setmaxangle_code == 0 { + if setmaxangle_code == 0{ setmaxangle_code = fatbinLoad(setmaxangle_map, "setmaxangle") } - setmaxangle_args.arg_dst = dst - setmaxangle_args.arg_mx = mx - setmaxangle_args.arg_my = my - setmaxangle_args.arg_mz = mz - setmaxangle_args.arg_aLUT2d = aLUT2d - setmaxangle_args.arg_regions = regions - setmaxangle_args.arg_Nx = Nx - setmaxangle_args.arg_Ny = Ny - setmaxangle_args.arg_Nz = Nz - setmaxangle_args.arg_PBC = PBC + setmaxangle_args.arg_dst = dst + setmaxangle_args.arg_mx = mx + setmaxangle_args.arg_my = my + setmaxangle_args.arg_mz = mz + setmaxangle_args.arg_aLUT2d = aLUT2d + setmaxangle_args.arg_regions = regions + setmaxangle_args.arg_Nx = Nx + setmaxangle_args.arg_Ny = Ny + setmaxangle_args.arg_Nz = Nz + setmaxangle_args.arg_PBC = PBC + args := 
setmaxangle_args.argptr[:] cu.LaunchKernel(setmaxangle_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("setmaxangle") } } // maps compute capability on PTX code for setmaxangle kernel. -var setmaxangle_map = map[int]string{0: "", - 30: setmaxangle_ptx_30, - 35: setmaxangle_ptx_35, - 37: setmaxangle_ptx_37, - 50: setmaxangle_ptx_50, - 52: setmaxangle_ptx_52, - 53: setmaxangle_ptx_53, - 60: setmaxangle_ptx_60, - 61: setmaxangle_ptx_61, - 70: setmaxangle_ptx_70, - 75: setmaxangle_ptx_75} +var setmaxangle_map = map[int]string{ 0: "" , +30: setmaxangle_ptx_30 , +35: setmaxangle_ptx_35 , +37: setmaxangle_ptx_37 , +50: setmaxangle_ptx_50 , +52: setmaxangle_ptx_52 , +53: setmaxangle_ptx_53 , +60: setmaxangle_ptx_60 , +61: setmaxangle_ptx_61 , +70: setmaxangle_ptx_70 , +75: setmaxangle_ptx_75 } // setmaxangle PTX code for various compute capabilities. -const ( - setmaxangle_ptx_30 = ` +const( + setmaxangle_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -677,7 +678,7 @@ BB0_34: ` - setmaxangle_ptx_35 = ` + setmaxangle_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -1259,7 +1260,7 @@ BB0_34: ` - setmaxangle_ptx_37 = ` + setmaxangle_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -1841,7 +1842,7 @@ BB0_34: ` - setmaxangle_ptx_50 = ` + setmaxangle_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -2423,7 +2424,7 @@ BB0_34: ` - setmaxangle_ptx_52 = ` + setmaxangle_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -3005,7 +3006,7 @@ BB0_34: ` - setmaxangle_ptx_53 = ` + setmaxangle_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -3587,7 +3588,7 @@ BB0_34: ` - setmaxangle_ptx_60 = ` + setmaxangle_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -4169,7 +4170,7 @@ BB0_34: ` - setmaxangle_ptx_61 = ` + setmaxangle_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -4751,7 +4752,7 @@ BB0_34: 
` - setmaxangle_ptx_70 = ` + setmaxangle_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -5333,7 +5334,7 @@ BB0_34: ` - setmaxangle_ptx_75 = ` + setmaxangle_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -5915,4 +5916,4 @@ BB0_34: ` -) + ) diff --git a/cuda/minimize_wrapper.go b/cuda/minimize_wrapper.go index a861f37bf..1193e5354 100644 --- a/cuda/minimize_wrapper.go +++ b/cuda/minimize_wrapper.go @@ -5,54 +5,54 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for minimize kernel var minimize_code cu.Function // Stores the arguments for minimize kernel invocation -type minimize_args_t struct { - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_m0x unsafe.Pointer - arg_m0y unsafe.Pointer - arg_m0z unsafe.Pointer - arg_tx unsafe.Pointer - arg_ty unsafe.Pointer - arg_tz unsafe.Pointer - arg_dt float32 - arg_N int - argptr [11]unsafe.Pointer +type minimize_args_t struct{ + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_m0x unsafe.Pointer + arg_m0y unsafe.Pointer + arg_m0z unsafe.Pointer + arg_tx unsafe.Pointer + arg_ty unsafe.Pointer + arg_tz unsafe.Pointer + arg_dt float32 + arg_N int + argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for minimize kernel invocation var minimize_args minimize_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- minimize_args.argptr[0] = unsafe.Pointer(&minimize_args.arg_mx) - minimize_args.argptr[1] = unsafe.Pointer(&minimize_args.arg_my) - minimize_args.argptr[2] = unsafe.Pointer(&minimize_args.arg_mz) - minimize_args.argptr[3] = unsafe.Pointer(&minimize_args.arg_m0x) - minimize_args.argptr[4] = unsafe.Pointer(&minimize_args.arg_m0y) - minimize_args.argptr[5] = unsafe.Pointer(&minimize_args.arg_m0z) - minimize_args.argptr[6] = unsafe.Pointer(&minimize_args.arg_tx) - minimize_args.argptr[7] = unsafe.Pointer(&minimize_args.arg_ty) - minimize_args.argptr[8] = unsafe.Pointer(&minimize_args.arg_tz) - minimize_args.argptr[9] = unsafe.Pointer(&minimize_args.arg_dt) - minimize_args.argptr[10] = unsafe.Pointer(&minimize_args.arg_N) -} + minimize_args.argptr[0] = unsafe.Pointer(&minimize_args.arg_mx) + minimize_args.argptr[1] = unsafe.Pointer(&minimize_args.arg_my) + minimize_args.argptr[2] = unsafe.Pointer(&minimize_args.arg_mz) + minimize_args.argptr[3] = unsafe.Pointer(&minimize_args.arg_m0x) + minimize_args.argptr[4] = unsafe.Pointer(&minimize_args.arg_m0y) + minimize_args.argptr[5] = unsafe.Pointer(&minimize_args.arg_m0z) + minimize_args.argptr[6] = unsafe.Pointer(&minimize_args.arg_tx) + minimize_args.argptr[7] = unsafe.Pointer(&minimize_args.arg_ty) + minimize_args.argptr[8] = unsafe.Pointer(&minimize_args.arg_tz) + minimize_args.argptr[9] = unsafe.Pointer(&minimize_args.arg_dt) + minimize_args.argptr[10] = unsafe.Pointer(&minimize_args.arg_N) + } // Wrapper for minimize CUDA kernel, asynchronous. 
-func k_minimize_async(mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, m0x unsafe.Pointer, m0y unsafe.Pointer, m0z unsafe.Pointer, tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, dt float32, N int, cfg *config) { - if Synchronous { // debug +func k_minimize_async ( mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, m0x unsafe.Pointer, m0y unsafe.Pointer, m0z unsafe.Pointer, tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, dt float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("minimize") } @@ -60,47 +60,48 @@ func k_minimize_async(mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, m minimize_args.Lock() defer minimize_args.Unlock() - if minimize_code == 0 { + if minimize_code == 0{ minimize_code = fatbinLoad(minimize_map, "minimize") } - minimize_args.arg_mx = mx - minimize_args.arg_my = my - minimize_args.arg_mz = mz - minimize_args.arg_m0x = m0x - minimize_args.arg_m0y = m0y - minimize_args.arg_m0z = m0z - minimize_args.arg_tx = tx - minimize_args.arg_ty = ty - minimize_args.arg_tz = tz - minimize_args.arg_dt = dt - minimize_args.arg_N = N + minimize_args.arg_mx = mx + minimize_args.arg_my = my + minimize_args.arg_mz = mz + minimize_args.arg_m0x = m0x + minimize_args.arg_m0y = m0y + minimize_args.arg_m0z = m0z + minimize_args.arg_tx = tx + minimize_args.arg_ty = ty + minimize_args.arg_tz = tz + minimize_args.arg_dt = dt + minimize_args.arg_N = N + args := minimize_args.argptr[:] cu.LaunchKernel(minimize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("minimize") } } // maps compute capability on PTX code for minimize kernel. 
-var minimize_map = map[int]string{0: "", - 30: minimize_ptx_30, - 35: minimize_ptx_35, - 37: minimize_ptx_37, - 50: minimize_ptx_50, - 52: minimize_ptx_52, - 53: minimize_ptx_53, - 60: minimize_ptx_60, - 61: minimize_ptx_61, - 70: minimize_ptx_70, - 75: minimize_ptx_75} +var minimize_map = map[int]string{ 0: "" , +30: minimize_ptx_30 , +35: minimize_ptx_35 , +37: minimize_ptx_37 , +50: minimize_ptx_50 , +52: minimize_ptx_52 , +53: minimize_ptx_53 , +60: minimize_ptx_60 , +61: minimize_ptx_61 , +70: minimize_ptx_70 , +75: minimize_ptx_75 } // minimize PTX code for various compute capabilities. -const ( - minimize_ptx_30 = ` +const( + minimize_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -201,7 +202,7 @@ BB0_2: ` - minimize_ptx_35 = ` + minimize_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -302,7 +303,7 @@ BB0_2: ` - minimize_ptx_37 = ` + minimize_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -403,7 +404,7 @@ BB0_2: ` - minimize_ptx_50 = ` + minimize_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -504,7 +505,7 @@ BB0_2: ` - minimize_ptx_52 = ` + minimize_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -605,7 +606,7 @@ BB0_2: ` - minimize_ptx_53 = ` + minimize_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -706,7 +707,7 @@ BB0_2: ` - minimize_ptx_60 = ` + minimize_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -807,7 +808,7 @@ BB0_2: ` - minimize_ptx_61 = ` + minimize_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -908,7 +909,7 @@ BB0_2: ` - minimize_ptx_70 = ` + minimize_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1009,7 +1010,7 @@ BB0_2: ` - minimize_ptx_75 = ` + minimize_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1110,4 +1111,4 @@ BB0_2: ` -) + ) diff --git a/cuda/mul_wrapper.go b/cuda/mul_wrapper.go index 8ac43ad65..d1190b984 100644 --- a/cuda/mul_wrapper.go +++ b/cuda/mul_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for mul kernel var mul_code cu.Function // Stores the arguments for mul kernel invocation -type mul_args_t struct { - arg_dst unsafe.Pointer - arg_a unsafe.Pointer - arg_b unsafe.Pointer - arg_N int - argptr [4]unsafe.Pointer +type mul_args_t struct{ + arg_dst unsafe.Pointer + arg_a unsafe.Pointer + arg_b unsafe.Pointer + arg_N int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for mul kernel invocation var mul_args mul_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - mul_args.argptr[0] = unsafe.Pointer(&mul_args.arg_dst) - mul_args.argptr[1] = unsafe.Pointer(&mul_args.arg_a) - mul_args.argptr[2] = unsafe.Pointer(&mul_args.arg_b) - mul_args.argptr[3] = unsafe.Pointer(&mul_args.arg_N) -} + mul_args.argptr[0] = unsafe.Pointer(&mul_args.arg_dst) + mul_args.argptr[1] = unsafe.Pointer(&mul_args.arg_a) + mul_args.argptr[2] = unsafe.Pointer(&mul_args.arg_b) + mul_args.argptr[3] = unsafe.Pointer(&mul_args.arg_N) + } // Wrapper for mul CUDA kernel, asynchronous. 
-func k_mul_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_mul_async ( dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("mul") } @@ -46,40 +46,41 @@ func k_mul_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, mul_args.Lock() defer mul_args.Unlock() - if mul_code == 0 { + if mul_code == 0{ mul_code = fatbinLoad(mul_map, "mul") } - mul_args.arg_dst = dst - mul_args.arg_a = a - mul_args.arg_b = b - mul_args.arg_N = N + mul_args.arg_dst = dst + mul_args.arg_a = a + mul_args.arg_b = b + mul_args.arg_N = N + args := mul_args.argptr[:] cu.LaunchKernel(mul_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("mul") } } // maps compute capability on PTX code for mul kernel. -var mul_map = map[int]string{0: "", - 30: mul_ptx_30, - 35: mul_ptx_35, - 37: mul_ptx_37, - 50: mul_ptx_50, - 52: mul_ptx_52, - 53: mul_ptx_53, - 60: mul_ptx_60, - 61: mul_ptx_61, - 70: mul_ptx_70, - 75: mul_ptx_75} +var mul_map = map[int]string{ 0: "" , +30: mul_ptx_30 , +35: mul_ptx_35 , +37: mul_ptx_37 , +50: mul_ptx_50 , +52: mul_ptx_52 , +53: mul_ptx_53 , +60: mul_ptx_60 , +61: mul_ptx_61 , +70: mul_ptx_70 , +75: mul_ptx_75 } // mul PTX code for various compute capabilities. 
-const ( - mul_ptx_30 = ` +const( + mul_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -131,7 +132,7 @@ BB0_2: ` - mul_ptx_35 = ` + mul_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -183,7 +184,7 @@ BB0_2: ` - mul_ptx_37 = ` + mul_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -235,7 +236,7 @@ BB0_2: ` - mul_ptx_50 = ` + mul_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -287,7 +288,7 @@ BB0_2: ` - mul_ptx_52 = ` + mul_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -339,7 +340,7 @@ BB0_2: ` - mul_ptx_53 = ` + mul_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -391,7 +392,7 @@ BB0_2: ` - mul_ptx_60 = ` + mul_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -443,7 +444,7 @@ BB0_2: ` - mul_ptx_61 = ` + mul_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -495,7 +496,7 @@ BB0_2: ` - mul_ptx_70 = ` + mul_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -547,7 +548,7 @@ BB0_2: ` - mul_ptx_75 = ` + mul_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -599,4 +600,4 @@ BB0_2: ` -) + ) diff --git a/cuda/normalize_wrapper.go b/cuda/normalize_wrapper.go index 4c4dcedb0..ab1e7d69f 100644 --- a/cuda/normalize_wrapper.go +++ b/cuda/normalize_wrapper.go @@ -5,42 +5,42 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for normalize kernel var normalize_code cu.Function // Stores the arguments for normalize kernel invocation -type normalize_args_t struct { - arg_vx unsafe.Pointer - arg_vy unsafe.Pointer - arg_vz unsafe.Pointer - arg_vol unsafe.Pointer - arg_N int - argptr [5]unsafe.Pointer +type normalize_args_t struct{ + arg_vx unsafe.Pointer + arg_vy unsafe.Pointer + arg_vz unsafe.Pointer + arg_vol unsafe.Pointer + arg_N int + argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for normalize kernel invocation var normalize_args normalize_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - normalize_args.argptr[0] = unsafe.Pointer(&normalize_args.arg_vx) - normalize_args.argptr[1] = unsafe.Pointer(&normalize_args.arg_vy) - normalize_args.argptr[2] = unsafe.Pointer(&normalize_args.arg_vz) - normalize_args.argptr[3] = unsafe.Pointer(&normalize_args.arg_vol) - normalize_args.argptr[4] = unsafe.Pointer(&normalize_args.arg_N) -} + normalize_args.argptr[0] = unsafe.Pointer(&normalize_args.arg_vx) + normalize_args.argptr[1] = unsafe.Pointer(&normalize_args.arg_vy) + normalize_args.argptr[2] = unsafe.Pointer(&normalize_args.arg_vz) + normalize_args.argptr[3] = unsafe.Pointer(&normalize_args.arg_vol) + normalize_args.argptr[4] = unsafe.Pointer(&normalize_args.arg_N) + } // Wrapper for normalize CUDA kernel, asynchronous. 
-func k_normalize_async(vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, vol unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_normalize_async ( vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, vol unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("normalize") } @@ -48,41 +48,42 @@ func k_normalize_async(vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, normalize_args.Lock() defer normalize_args.Unlock() - if normalize_code == 0 { + if normalize_code == 0{ normalize_code = fatbinLoad(normalize_map, "normalize") } - normalize_args.arg_vx = vx - normalize_args.arg_vy = vy - normalize_args.arg_vz = vz - normalize_args.arg_vol = vol - normalize_args.arg_N = N + normalize_args.arg_vx = vx + normalize_args.arg_vy = vy + normalize_args.arg_vz = vz + normalize_args.arg_vol = vol + normalize_args.arg_N = N + args := normalize_args.argptr[:] cu.LaunchKernel(normalize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("normalize") } } // maps compute capability on PTX code for normalize kernel. -var normalize_map = map[int]string{0: "", - 30: normalize_ptx_30, - 35: normalize_ptx_35, - 37: normalize_ptx_37, - 50: normalize_ptx_50, - 52: normalize_ptx_52, - 53: normalize_ptx_53, - 60: normalize_ptx_60, - 61: normalize_ptx_61, - 70: normalize_ptx_70, - 75: normalize_ptx_75} +var normalize_map = map[int]string{ 0: "" , +30: normalize_ptx_30 , +35: normalize_ptx_35 , +37: normalize_ptx_37 , +50: normalize_ptx_50 , +52: normalize_ptx_52 , +53: normalize_ptx_53 , +60: normalize_ptx_60 , +61: normalize_ptx_61 , +70: normalize_ptx_70 , +75: normalize_ptx_75 } // normalize PTX code for various compute capabilities. 
-const ( - normalize_ptx_30 = ` +const( + normalize_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -165,7 +166,7 @@ BB0_6: ` - normalize_ptx_35 = ` + normalize_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -248,7 +249,7 @@ BB0_6: ` - normalize_ptx_37 = ` + normalize_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -331,7 +332,7 @@ BB0_6: ` - normalize_ptx_50 = ` + normalize_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -414,7 +415,7 @@ BB0_6: ` - normalize_ptx_52 = ` + normalize_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -497,7 +498,7 @@ BB0_6: ` - normalize_ptx_53 = ` + normalize_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -580,7 +581,7 @@ BB0_6: ` - normalize_ptx_60 = ` + normalize_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -663,7 +664,7 @@ BB0_6: ` - normalize_ptx_61 = ` + normalize_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -746,7 +747,7 @@ BB0_6: ` - normalize_ptx_70 = ` + normalize_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -829,7 +830,7 @@ BB0_6: ` - normalize_ptx_75 = ` + normalize_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -912,4 +913,4 @@ BB0_6: ` -) + ) diff --git a/cuda/reducedot_wrapper.go b/cuda/reducedot_wrapper.go index f7ca89bc8..6ab243361 100644 --- a/cuda/reducedot_wrapper.go +++ b/cuda/reducedot_wrapper.go @@ -5,42 +5,42 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for reducedot kernel var reducedot_code cu.Function // Stores the arguments for reducedot kernel invocation -type reducedot_args_t struct { - arg_x1 unsafe.Pointer - arg_x2 unsafe.Pointer - arg_dst unsafe.Pointer - arg_initVal float32 - arg_n int - argptr [5]unsafe.Pointer +type reducedot_args_t struct{ + arg_x1 unsafe.Pointer + arg_x2 unsafe.Pointer + arg_dst unsafe.Pointer + arg_initVal float32 + arg_n int + argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for reducedot kernel invocation var reducedot_args reducedot_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - reducedot_args.argptr[0] = unsafe.Pointer(&reducedot_args.arg_x1) - reducedot_args.argptr[1] = unsafe.Pointer(&reducedot_args.arg_x2) - reducedot_args.argptr[2] = unsafe.Pointer(&reducedot_args.arg_dst) - reducedot_args.argptr[3] = unsafe.Pointer(&reducedot_args.arg_initVal) - reducedot_args.argptr[4] = unsafe.Pointer(&reducedot_args.arg_n) -} + reducedot_args.argptr[0] = unsafe.Pointer(&reducedot_args.arg_x1) + reducedot_args.argptr[1] = unsafe.Pointer(&reducedot_args.arg_x2) + reducedot_args.argptr[2] = unsafe.Pointer(&reducedot_args.arg_dst) + reducedot_args.argptr[3] = unsafe.Pointer(&reducedot_args.arg_initVal) + reducedot_args.argptr[4] = unsafe.Pointer(&reducedot_args.arg_n) + } // Wrapper for reducedot CUDA kernel, asynchronous. 
-func k_reducedot_async(x1 unsafe.Pointer, x2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { - if Synchronous { // debug +func k_reducedot_async ( x1 unsafe.Pointer, x2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("reducedot") } @@ -48,41 +48,42 @@ func k_reducedot_async(x1 unsafe.Pointer, x2 unsafe.Pointer, dst unsafe.Pointer, reducedot_args.Lock() defer reducedot_args.Unlock() - if reducedot_code == 0 { + if reducedot_code == 0{ reducedot_code = fatbinLoad(reducedot_map, "reducedot") } - reducedot_args.arg_x1 = x1 - reducedot_args.arg_x2 = x2 - reducedot_args.arg_dst = dst - reducedot_args.arg_initVal = initVal - reducedot_args.arg_n = n + reducedot_args.arg_x1 = x1 + reducedot_args.arg_x2 = x2 + reducedot_args.arg_dst = dst + reducedot_args.arg_initVal = initVal + reducedot_args.arg_n = n + args := reducedot_args.argptr[:] cu.LaunchKernel(reducedot_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("reducedot") } } // maps compute capability on PTX code for reducedot kernel. -var reducedot_map = map[int]string{0: "", - 30: reducedot_ptx_30, - 35: reducedot_ptx_35, - 37: reducedot_ptx_37, - 50: reducedot_ptx_50, - 52: reducedot_ptx_52, - 53: reducedot_ptx_53, - 60: reducedot_ptx_60, - 61: reducedot_ptx_61, - 70: reducedot_ptx_70, - 75: reducedot_ptx_75} +var reducedot_map = map[int]string{ 0: "" , +30: reducedot_ptx_30 , +35: reducedot_ptx_35 , +37: reducedot_ptx_37 , +50: reducedot_ptx_50 , +52: reducedot_ptx_52 , +53: reducedot_ptx_53 , +60: reducedot_ptx_60 , +61: reducedot_ptx_61 , +70: reducedot_ptx_70 , +75: reducedot_ptx_75 } // reducedot PTX code for various compute capabilities. 
-const ( - reducedot_ptx_30 = ` +const( + reducedot_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -202,7 +203,7 @@ BB0_10: ` - reducedot_ptx_35 = ` + reducedot_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -322,7 +323,7 @@ BB0_10: ` - reducedot_ptx_37 = ` + reducedot_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -442,7 +443,7 @@ BB0_10: ` - reducedot_ptx_50 = ` + reducedot_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -562,7 +563,7 @@ BB0_10: ` - reducedot_ptx_52 = ` + reducedot_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -682,7 +683,7 @@ BB0_10: ` - reducedot_ptx_53 = ` + reducedot_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -802,7 +803,7 @@ BB0_10: ` - reducedot_ptx_60 = ` + reducedot_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -922,7 +923,7 @@ BB0_10: ` - reducedot_ptx_61 = ` + reducedot_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1042,7 +1043,7 @@ BB0_10: ` - reducedot_ptx_70 = ` + reducedot_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1162,7 +1163,7 @@ BB0_10: ` - reducedot_ptx_75 = ` + reducedot_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1282,4 +1283,4 @@ BB0_10: ` -) + ) diff --git a/cuda/reducemaxabs_wrapper.go b/cuda/reducemaxabs_wrapper.go index 7fca796d7..14000e74e 100644 --- a/cuda/reducemaxabs_wrapper.go +++ b/cuda/reducemaxabs_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for reducemaxabs kernel var reducemaxabs_code cu.Function // Stores the arguments for reducemaxabs kernel invocation -type reducemaxabs_args_t struct { - arg_src unsafe.Pointer - arg_dst unsafe.Pointer - arg_initVal float32 - arg_n int - argptr [4]unsafe.Pointer +type reducemaxabs_args_t struct{ + arg_src unsafe.Pointer + arg_dst unsafe.Pointer + arg_initVal float32 + arg_n int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxabs kernel invocation var reducemaxabs_args reducemaxabs_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - reducemaxabs_args.argptr[0] = unsafe.Pointer(&reducemaxabs_args.arg_src) - reducemaxabs_args.argptr[1] = unsafe.Pointer(&reducemaxabs_args.arg_dst) - reducemaxabs_args.argptr[2] = unsafe.Pointer(&reducemaxabs_args.arg_initVal) - reducemaxabs_args.argptr[3] = unsafe.Pointer(&reducemaxabs_args.arg_n) -} + reducemaxabs_args.argptr[0] = unsafe.Pointer(&reducemaxabs_args.arg_src) + reducemaxabs_args.argptr[1] = unsafe.Pointer(&reducemaxabs_args.arg_dst) + reducemaxabs_args.argptr[2] = unsafe.Pointer(&reducemaxabs_args.arg_initVal) + reducemaxabs_args.argptr[3] = unsafe.Pointer(&reducemaxabs_args.arg_n) + } // Wrapper for reducemaxabs CUDA kernel, asynchronous. 
-func k_reducemaxabs_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { - if Synchronous { // debug +func k_reducemaxabs_async ( src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("reducemaxabs") } @@ -46,40 +46,41 @@ func k_reducemaxabs_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float3 reducemaxabs_args.Lock() defer reducemaxabs_args.Unlock() - if reducemaxabs_code == 0 { + if reducemaxabs_code == 0{ reducemaxabs_code = fatbinLoad(reducemaxabs_map, "reducemaxabs") } - reducemaxabs_args.arg_src = src - reducemaxabs_args.arg_dst = dst - reducemaxabs_args.arg_initVal = initVal - reducemaxabs_args.arg_n = n + reducemaxabs_args.arg_src = src + reducemaxabs_args.arg_dst = dst + reducemaxabs_args.arg_initVal = initVal + reducemaxabs_args.arg_n = n + args := reducemaxabs_args.argptr[:] cu.LaunchKernel(reducemaxabs_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("reducemaxabs") } } // maps compute capability on PTX code for reducemaxabs kernel. -var reducemaxabs_map = map[int]string{0: "", - 30: reducemaxabs_ptx_30, - 35: reducemaxabs_ptx_35, - 37: reducemaxabs_ptx_37, - 50: reducemaxabs_ptx_50, - 52: reducemaxabs_ptx_52, - 53: reducemaxabs_ptx_53, - 60: reducemaxabs_ptx_60, - 61: reducemaxabs_ptx_61, - 70: reducemaxabs_ptx_70, - 75: reducemaxabs_ptx_75} +var reducemaxabs_map = map[int]string{ 0: "" , +30: reducemaxabs_ptx_30 , +35: reducemaxabs_ptx_35 , +37: reducemaxabs_ptx_37 , +50: reducemaxabs_ptx_50 , +52: reducemaxabs_ptx_52 , +53: reducemaxabs_ptx_53 , +60: reducemaxabs_ptx_60 , +61: reducemaxabs_ptx_61 , +70: reducemaxabs_ptx_70 , +75: reducemaxabs_ptx_75 } // reducemaxabs PTX code for various compute capabilities. 
-const ( - reducemaxabs_ptx_30 = ` +const( + reducemaxabs_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -197,7 +198,7 @@ BB0_10: ` - reducemaxabs_ptx_35 = ` + reducemaxabs_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -315,7 +316,7 @@ BB0_10: ` - reducemaxabs_ptx_37 = ` + reducemaxabs_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -433,7 +434,7 @@ BB0_10: ` - reducemaxabs_ptx_50 = ` + reducemaxabs_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -551,7 +552,7 @@ BB0_10: ` - reducemaxabs_ptx_52 = ` + reducemaxabs_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -669,7 +670,7 @@ BB0_10: ` - reducemaxabs_ptx_53 = ` + reducemaxabs_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -787,7 +788,7 @@ BB0_10: ` - reducemaxabs_ptx_60 = ` + reducemaxabs_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -905,7 +906,7 @@ BB0_10: ` - reducemaxabs_ptx_61 = ` + reducemaxabs_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1023,7 +1024,7 @@ BB0_10: ` - reducemaxabs_ptx_70 = ` + reducemaxabs_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1141,7 +1142,7 @@ BB0_10: ` - reducemaxabs_ptx_75 = ` + reducemaxabs_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1259,4 +1260,4 @@ BB0_10: ` -) + ) diff --git a/cuda/reducemaxdiff_wrapper.go b/cuda/reducemaxdiff_wrapper.go index 7a7aa8bdc..ba2d7f2ab 100644 --- a/cuda/reducemaxdiff_wrapper.go +++ b/cuda/reducemaxdiff_wrapper.go @@ -5,42 +5,42 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for reducemaxdiff kernel var reducemaxdiff_code cu.Function // Stores the arguments for reducemaxdiff kernel invocation -type reducemaxdiff_args_t struct { - arg_src1 unsafe.Pointer - arg_src2 unsafe.Pointer - arg_dst unsafe.Pointer - arg_initVal float32 - arg_n int - argptr [5]unsafe.Pointer +type reducemaxdiff_args_t struct{ + arg_src1 unsafe.Pointer + arg_src2 unsafe.Pointer + arg_dst unsafe.Pointer + arg_initVal float32 + arg_n int + argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxdiff kernel invocation var reducemaxdiff_args reducemaxdiff_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - reducemaxdiff_args.argptr[0] = unsafe.Pointer(&reducemaxdiff_args.arg_src1) - reducemaxdiff_args.argptr[1] = unsafe.Pointer(&reducemaxdiff_args.arg_src2) - reducemaxdiff_args.argptr[2] = unsafe.Pointer(&reducemaxdiff_args.arg_dst) - reducemaxdiff_args.argptr[3] = unsafe.Pointer(&reducemaxdiff_args.arg_initVal) - reducemaxdiff_args.argptr[4] = unsafe.Pointer(&reducemaxdiff_args.arg_n) -} + reducemaxdiff_args.argptr[0] = unsafe.Pointer(&reducemaxdiff_args.arg_src1) + reducemaxdiff_args.argptr[1] = unsafe.Pointer(&reducemaxdiff_args.arg_src2) + reducemaxdiff_args.argptr[2] = unsafe.Pointer(&reducemaxdiff_args.arg_dst) + reducemaxdiff_args.argptr[3] = unsafe.Pointer(&reducemaxdiff_args.arg_initVal) + reducemaxdiff_args.argptr[4] = unsafe.Pointer(&reducemaxdiff_args.arg_n) + } // Wrapper for reducemaxdiff CUDA kernel, asynchronous. 
-func k_reducemaxdiff_async(src1 unsafe.Pointer, src2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { - if Synchronous { // debug +func k_reducemaxdiff_async ( src1 unsafe.Pointer, src2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("reducemaxdiff") } @@ -48,41 +48,42 @@ func k_reducemaxdiff_async(src1 unsafe.Pointer, src2 unsafe.Pointer, dst unsafe. reducemaxdiff_args.Lock() defer reducemaxdiff_args.Unlock() - if reducemaxdiff_code == 0 { + if reducemaxdiff_code == 0{ reducemaxdiff_code = fatbinLoad(reducemaxdiff_map, "reducemaxdiff") } - reducemaxdiff_args.arg_src1 = src1 - reducemaxdiff_args.arg_src2 = src2 - reducemaxdiff_args.arg_dst = dst - reducemaxdiff_args.arg_initVal = initVal - reducemaxdiff_args.arg_n = n + reducemaxdiff_args.arg_src1 = src1 + reducemaxdiff_args.arg_src2 = src2 + reducemaxdiff_args.arg_dst = dst + reducemaxdiff_args.arg_initVal = initVal + reducemaxdiff_args.arg_n = n + args := reducemaxdiff_args.argptr[:] cu.LaunchKernel(reducemaxdiff_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("reducemaxdiff") } } // maps compute capability on PTX code for reducemaxdiff kernel. 
-var reducemaxdiff_map = map[int]string{0: "", - 30: reducemaxdiff_ptx_30, - 35: reducemaxdiff_ptx_35, - 37: reducemaxdiff_ptx_37, - 50: reducemaxdiff_ptx_50, - 52: reducemaxdiff_ptx_52, - 53: reducemaxdiff_ptx_53, - 60: reducemaxdiff_ptx_60, - 61: reducemaxdiff_ptx_61, - 70: reducemaxdiff_ptx_70, - 75: reducemaxdiff_ptx_75} +var reducemaxdiff_map = map[int]string{ 0: "" , +30: reducemaxdiff_ptx_30 , +35: reducemaxdiff_ptx_35 , +37: reducemaxdiff_ptx_37 , +50: reducemaxdiff_ptx_50 , +52: reducemaxdiff_ptx_52 , +53: reducemaxdiff_ptx_53 , +60: reducemaxdiff_ptx_60 , +61: reducemaxdiff_ptx_61 , +70: reducemaxdiff_ptx_70 , +75: reducemaxdiff_ptx_75 } // reducemaxdiff PTX code for various compute capabilities. -const ( - reducemaxdiff_ptx_30 = ` +const( + reducemaxdiff_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -206,7 +207,7 @@ BB0_10: ` - reducemaxdiff_ptx_35 = ` + reducemaxdiff_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -330,7 +331,7 @@ BB0_10: ` - reducemaxdiff_ptx_37 = ` + reducemaxdiff_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -454,7 +455,7 @@ BB0_10: ` - reducemaxdiff_ptx_50 = ` + reducemaxdiff_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -578,7 +579,7 @@ BB0_10: ` - reducemaxdiff_ptx_52 = ` + reducemaxdiff_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -702,7 +703,7 @@ BB0_10: ` - reducemaxdiff_ptx_53 = ` + reducemaxdiff_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -826,7 +827,7 @@ BB0_10: ` - reducemaxdiff_ptx_60 = ` + reducemaxdiff_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -950,7 +951,7 @@ BB0_10: ` - reducemaxdiff_ptx_61 = ` + reducemaxdiff_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1074,7 +1075,7 @@ BB0_10: ` - reducemaxdiff_ptx_70 = ` + reducemaxdiff_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1198,7 +1199,7 @@ BB0_10: ` - reducemaxdiff_ptx_75 = ` + reducemaxdiff_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1322,4 
+1323,4 @@ BB0_10: ` -) + ) diff --git a/cuda/reducemaxvecdiff2_wrapper.go b/cuda/reducemaxvecdiff2_wrapper.go index 12051af67..e2d66886e 100644 --- a/cuda/reducemaxvecdiff2_wrapper.go +++ b/cuda/reducemaxvecdiff2_wrapper.go @@ -5,50 +5,50 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for reducemaxvecdiff2 kernel var reducemaxvecdiff2_code cu.Function // Stores the arguments for reducemaxvecdiff2 kernel invocation -type reducemaxvecdiff2_args_t struct { - arg_x1 unsafe.Pointer - arg_y1 unsafe.Pointer - arg_z1 unsafe.Pointer - arg_x2 unsafe.Pointer - arg_y2 unsafe.Pointer - arg_z2 unsafe.Pointer - arg_dst unsafe.Pointer - arg_initVal float32 - arg_n int - argptr [9]unsafe.Pointer +type reducemaxvecdiff2_args_t struct{ + arg_x1 unsafe.Pointer + arg_y1 unsafe.Pointer + arg_z1 unsafe.Pointer + arg_x2 unsafe.Pointer + arg_y2 unsafe.Pointer + arg_z2 unsafe.Pointer + arg_dst unsafe.Pointer + arg_initVal float32 + arg_n int + argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxvecdiff2 kernel invocation var reducemaxvecdiff2_args reducemaxvecdiff2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- reducemaxvecdiff2_args.argptr[0] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x1) - reducemaxvecdiff2_args.argptr[1] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y1) - reducemaxvecdiff2_args.argptr[2] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z1) - reducemaxvecdiff2_args.argptr[3] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x2) - reducemaxvecdiff2_args.argptr[4] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y2) - reducemaxvecdiff2_args.argptr[5] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z2) - reducemaxvecdiff2_args.argptr[6] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_dst) - reducemaxvecdiff2_args.argptr[7] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_initVal) - reducemaxvecdiff2_args.argptr[8] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_n) -} + reducemaxvecdiff2_args.argptr[0] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x1) + reducemaxvecdiff2_args.argptr[1] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y1) + reducemaxvecdiff2_args.argptr[2] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z1) + reducemaxvecdiff2_args.argptr[3] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x2) + reducemaxvecdiff2_args.argptr[4] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y2) + reducemaxvecdiff2_args.argptr[5] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z2) + reducemaxvecdiff2_args.argptr[6] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_dst) + reducemaxvecdiff2_args.argptr[7] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_initVal) + reducemaxvecdiff2_args.argptr[8] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_n) + } // Wrapper for reducemaxvecdiff2 CUDA kernel, asynchronous. 
-func k_reducemaxvecdiff2_async(x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.Pointer, x2 unsafe.Pointer, y2 unsafe.Pointer, z2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { - if Synchronous { // debug +func k_reducemaxvecdiff2_async ( x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.Pointer, x2 unsafe.Pointer, y2 unsafe.Pointer, z2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("reducemaxvecdiff2") } @@ -56,45 +56,46 @@ func k_reducemaxvecdiff2_async(x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.P reducemaxvecdiff2_args.Lock() defer reducemaxvecdiff2_args.Unlock() - if reducemaxvecdiff2_code == 0 { + if reducemaxvecdiff2_code == 0{ reducemaxvecdiff2_code = fatbinLoad(reducemaxvecdiff2_map, "reducemaxvecdiff2") } - reducemaxvecdiff2_args.arg_x1 = x1 - reducemaxvecdiff2_args.arg_y1 = y1 - reducemaxvecdiff2_args.arg_z1 = z1 - reducemaxvecdiff2_args.arg_x2 = x2 - reducemaxvecdiff2_args.arg_y2 = y2 - reducemaxvecdiff2_args.arg_z2 = z2 - reducemaxvecdiff2_args.arg_dst = dst - reducemaxvecdiff2_args.arg_initVal = initVal - reducemaxvecdiff2_args.arg_n = n + reducemaxvecdiff2_args.arg_x1 = x1 + reducemaxvecdiff2_args.arg_y1 = y1 + reducemaxvecdiff2_args.arg_z1 = z1 + reducemaxvecdiff2_args.arg_x2 = x2 + reducemaxvecdiff2_args.arg_y2 = y2 + reducemaxvecdiff2_args.arg_z2 = z2 + reducemaxvecdiff2_args.arg_dst = dst + reducemaxvecdiff2_args.arg_initVal = initVal + reducemaxvecdiff2_args.arg_n = n + args := reducemaxvecdiff2_args.argptr[:] cu.LaunchKernel(reducemaxvecdiff2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("reducemaxvecdiff2") } } // maps compute capability on PTX code for reducemaxvecdiff2 kernel. 
-var reducemaxvecdiff2_map = map[int]string{0: "", - 30: reducemaxvecdiff2_ptx_30, - 35: reducemaxvecdiff2_ptx_35, - 37: reducemaxvecdiff2_ptx_37, - 50: reducemaxvecdiff2_ptx_50, - 52: reducemaxvecdiff2_ptx_52, - 53: reducemaxvecdiff2_ptx_53, - 60: reducemaxvecdiff2_ptx_60, - 61: reducemaxvecdiff2_ptx_61, - 70: reducemaxvecdiff2_ptx_70, - 75: reducemaxvecdiff2_ptx_75} +var reducemaxvecdiff2_map = map[int]string{ 0: "" , +30: reducemaxvecdiff2_ptx_30 , +35: reducemaxvecdiff2_ptx_35 , +37: reducemaxvecdiff2_ptx_37 , +50: reducemaxvecdiff2_ptx_50 , +52: reducemaxvecdiff2_ptx_52 , +53: reducemaxvecdiff2_ptx_53 , +60: reducemaxvecdiff2_ptx_60 , +61: reducemaxvecdiff2_ptx_61 , +70: reducemaxvecdiff2_ptx_70 , +75: reducemaxvecdiff2_ptx_75 } // reducemaxvecdiff2 PTX code for various compute capabilities. -const ( - reducemaxvecdiff2_ptx_30 = ` +const( + reducemaxvecdiff2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -242,7 +243,7 @@ BB0_10: ` - reducemaxvecdiff2_ptx_35 = ` + reducemaxvecdiff2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -391,7 +392,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_37 = ` + reducemaxvecdiff2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -540,7 +541,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_50 = ` + reducemaxvecdiff2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -689,7 +690,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_52 = ` + reducemaxvecdiff2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -838,7 +839,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_53 = ` + reducemaxvecdiff2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -987,7 +988,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_60 = ` + reducemaxvecdiff2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -1136,7 +1137,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_61 = ` + reducemaxvecdiff2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1285,7 +1286,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_70 = ` + reducemaxvecdiff2_ptx_70 = ` .version 6.3 .target 
sm_70 .address_size 64 @@ -1434,7 +1435,7 @@ BB0_11: ` - reducemaxvecdiff2_ptx_75 = ` + reducemaxvecdiff2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1583,4 +1584,4 @@ BB0_11: ` -) + ) diff --git a/cuda/reducemaxvecnorm2_wrapper.go b/cuda/reducemaxvecnorm2_wrapper.go index 16b19fa75..26beea3f0 100644 --- a/cuda/reducemaxvecnorm2_wrapper.go +++ b/cuda/reducemaxvecnorm2_wrapper.go @@ -5,44 +5,44 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for reducemaxvecnorm2 kernel var reducemaxvecnorm2_code cu.Function // Stores the arguments for reducemaxvecnorm2 kernel invocation -type reducemaxvecnorm2_args_t struct { - arg_x unsafe.Pointer - arg_y unsafe.Pointer - arg_z unsafe.Pointer - arg_dst unsafe.Pointer - arg_initVal float32 - arg_n int - argptr [6]unsafe.Pointer +type reducemaxvecnorm2_args_t struct{ + arg_x unsafe.Pointer + arg_y unsafe.Pointer + arg_z unsafe.Pointer + arg_dst unsafe.Pointer + arg_initVal float32 + arg_n int + argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxvecnorm2 kernel invocation var reducemaxvecnorm2_args reducemaxvecnorm2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- reducemaxvecnorm2_args.argptr[0] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_x) - reducemaxvecnorm2_args.argptr[1] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_y) - reducemaxvecnorm2_args.argptr[2] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_z) - reducemaxvecnorm2_args.argptr[3] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_dst) - reducemaxvecnorm2_args.argptr[4] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_initVal) - reducemaxvecnorm2_args.argptr[5] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_n) -} + reducemaxvecnorm2_args.argptr[0] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_x) + reducemaxvecnorm2_args.argptr[1] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_y) + reducemaxvecnorm2_args.argptr[2] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_z) + reducemaxvecnorm2_args.argptr[3] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_dst) + reducemaxvecnorm2_args.argptr[4] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_initVal) + reducemaxvecnorm2_args.argptr[5] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_n) + } // Wrapper for reducemaxvecnorm2 CUDA kernel, asynchronous. 
-func k_reducemaxvecnorm2_async(x unsafe.Pointer, y unsafe.Pointer, z unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { - if Synchronous { // debug +func k_reducemaxvecnorm2_async ( x unsafe.Pointer, y unsafe.Pointer, z unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("reducemaxvecnorm2") } @@ -50,42 +50,43 @@ func k_reducemaxvecnorm2_async(x unsafe.Pointer, y unsafe.Pointer, z unsafe.Poin reducemaxvecnorm2_args.Lock() defer reducemaxvecnorm2_args.Unlock() - if reducemaxvecnorm2_code == 0 { + if reducemaxvecnorm2_code == 0{ reducemaxvecnorm2_code = fatbinLoad(reducemaxvecnorm2_map, "reducemaxvecnorm2") } - reducemaxvecnorm2_args.arg_x = x - reducemaxvecnorm2_args.arg_y = y - reducemaxvecnorm2_args.arg_z = z - reducemaxvecnorm2_args.arg_dst = dst - reducemaxvecnorm2_args.arg_initVal = initVal - reducemaxvecnorm2_args.arg_n = n + reducemaxvecnorm2_args.arg_x = x + reducemaxvecnorm2_args.arg_y = y + reducemaxvecnorm2_args.arg_z = z + reducemaxvecnorm2_args.arg_dst = dst + reducemaxvecnorm2_args.arg_initVal = initVal + reducemaxvecnorm2_args.arg_n = n + args := reducemaxvecnorm2_args.argptr[:] cu.LaunchKernel(reducemaxvecnorm2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("reducemaxvecnorm2") } } // maps compute capability on PTX code for reducemaxvecnorm2 kernel. 
-var reducemaxvecnorm2_map = map[int]string{0: "", - 30: reducemaxvecnorm2_ptx_30, - 35: reducemaxvecnorm2_ptx_35, - 37: reducemaxvecnorm2_ptx_37, - 50: reducemaxvecnorm2_ptx_50, - 52: reducemaxvecnorm2_ptx_52, - 53: reducemaxvecnorm2_ptx_53, - 60: reducemaxvecnorm2_ptx_60, - 61: reducemaxvecnorm2_ptx_61, - 70: reducemaxvecnorm2_ptx_70, - 75: reducemaxvecnorm2_ptx_75} +var reducemaxvecnorm2_map = map[int]string{ 0: "" , +30: reducemaxvecnorm2_ptx_30 , +35: reducemaxvecnorm2_ptx_35 , +37: reducemaxvecnorm2_ptx_37 , +50: reducemaxvecnorm2_ptx_50 , +52: reducemaxvecnorm2_ptx_52 , +53: reducemaxvecnorm2_ptx_53 , +60: reducemaxvecnorm2_ptx_60 , +61: reducemaxvecnorm2_ptx_61 , +70: reducemaxvecnorm2_ptx_70 , +75: reducemaxvecnorm2_ptx_75 } // reducemaxvecnorm2 PTX code for various compute capabilities. -const ( - reducemaxvecnorm2_ptx_30 = ` +const( + reducemaxvecnorm2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -215,7 +216,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_35 = ` + reducemaxvecnorm2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -345,7 +346,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_37 = ` + reducemaxvecnorm2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -475,7 +476,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_50 = ` + reducemaxvecnorm2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -605,7 +606,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_52 = ` + reducemaxvecnorm2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -735,7 +736,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_53 = ` + reducemaxvecnorm2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -865,7 +866,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_60 = ` + reducemaxvecnorm2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -995,7 +996,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_61 = ` + reducemaxvecnorm2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1125,7 +1126,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_70 = ` + reducemaxvecnorm2_ptx_70 = ` .version 6.3 .target 
sm_70 .address_size 64 @@ -1255,7 +1256,7 @@ BB0_10: ` - reducemaxvecnorm2_ptx_75 = ` + reducemaxvecnorm2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1385,4 +1386,4 @@ BB0_10: ` -) + ) diff --git a/cuda/reducesum_wrapper.go b/cuda/reducesum_wrapper.go index a3b2688d4..4adea9b0c 100644 --- a/cuda/reducesum_wrapper.go +++ b/cuda/reducesum_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for reducesum kernel var reducesum_code cu.Function // Stores the arguments for reducesum kernel invocation -type reducesum_args_t struct { - arg_src unsafe.Pointer - arg_dst unsafe.Pointer - arg_initVal float32 - arg_n int - argptr [4]unsafe.Pointer +type reducesum_args_t struct{ + arg_src unsafe.Pointer + arg_dst unsafe.Pointer + arg_initVal float32 + arg_n int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for reducesum kernel invocation var reducesum_args reducesum_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - reducesum_args.argptr[0] = unsafe.Pointer(&reducesum_args.arg_src) - reducesum_args.argptr[1] = unsafe.Pointer(&reducesum_args.arg_dst) - reducesum_args.argptr[2] = unsafe.Pointer(&reducesum_args.arg_initVal) - reducesum_args.argptr[3] = unsafe.Pointer(&reducesum_args.arg_n) -} + reducesum_args.argptr[0] = unsafe.Pointer(&reducesum_args.arg_src) + reducesum_args.argptr[1] = unsafe.Pointer(&reducesum_args.arg_dst) + reducesum_args.argptr[2] = unsafe.Pointer(&reducesum_args.arg_initVal) + reducesum_args.argptr[3] = unsafe.Pointer(&reducesum_args.arg_n) + } // Wrapper for reducesum CUDA kernel, asynchronous. 
-func k_reducesum_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { - if Synchronous { // debug +func k_reducesum_async ( src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("reducesum") } @@ -46,40 +46,41 @@ func k_reducesum_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, reducesum_args.Lock() defer reducesum_args.Unlock() - if reducesum_code == 0 { + if reducesum_code == 0{ reducesum_code = fatbinLoad(reducesum_map, "reducesum") } - reducesum_args.arg_src = src - reducesum_args.arg_dst = dst - reducesum_args.arg_initVal = initVal - reducesum_args.arg_n = n + reducesum_args.arg_src = src + reducesum_args.arg_dst = dst + reducesum_args.arg_initVal = initVal + reducesum_args.arg_n = n + args := reducesum_args.argptr[:] cu.LaunchKernel(reducesum_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("reducesum") } } // maps compute capability on PTX code for reducesum kernel. -var reducesum_map = map[int]string{0: "", - 30: reducesum_ptx_30, - 35: reducesum_ptx_35, - 37: reducesum_ptx_37, - 50: reducesum_ptx_50, - 52: reducesum_ptx_52, - 53: reducesum_ptx_53, - 60: reducesum_ptx_60, - 61: reducesum_ptx_61, - 70: reducesum_ptx_70, - 75: reducesum_ptx_75} +var reducesum_map = map[int]string{ 0: "" , +30: reducesum_ptx_30 , +35: reducesum_ptx_35 , +37: reducesum_ptx_37 , +50: reducesum_ptx_50 , +52: reducesum_ptx_52 , +53: reducesum_ptx_53 , +60: reducesum_ptx_60 , +61: reducesum_ptx_61 , +70: reducesum_ptx_70 , +75: reducesum_ptx_75 } // reducesum PTX code for various compute capabilities. 
-const ( - reducesum_ptx_30 = ` +const( + reducesum_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -194,7 +195,7 @@ BB0_10: ` - reducesum_ptx_35 = ` + reducesum_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -309,7 +310,7 @@ BB0_10: ` - reducesum_ptx_37 = ` + reducesum_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -424,7 +425,7 @@ BB0_10: ` - reducesum_ptx_50 = ` + reducesum_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -539,7 +540,7 @@ BB0_10: ` - reducesum_ptx_52 = ` + reducesum_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -654,7 +655,7 @@ BB0_10: ` - reducesum_ptx_53 = ` + reducesum_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -769,7 +770,7 @@ BB0_10: ` - reducesum_ptx_60 = ` + reducesum_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -884,7 +885,7 @@ BB0_10: ` - reducesum_ptx_61 = ` + reducesum_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -999,7 +1000,7 @@ BB0_10: ` - reducesum_ptx_70 = ` + reducesum_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1114,7 +1115,7 @@ BB0_10: ` - reducesum_ptx_75 = ` + reducesum_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1229,4 +1230,4 @@ BB0_10: ` -) + ) diff --git a/cuda/regionadds_wrapper.go b/cuda/regionadds_wrapper.go index d69f2645d..4dbbfd9de 100644 --- a/cuda/regionadds_wrapper.go +++ b/cuda/regionadds_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for regionadds kernel var regionadds_code cu.Function // Stores the arguments for regionadds kernel invocation -type regionadds_args_t struct { - arg_dst unsafe.Pointer - arg_LUT unsafe.Pointer - arg_regions unsafe.Pointer - arg_N int - argptr [4]unsafe.Pointer +type regionadds_args_t struct{ + arg_dst unsafe.Pointer + arg_LUT unsafe.Pointer + arg_regions unsafe.Pointer + arg_N int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for regionadds kernel invocation var regionadds_args regionadds_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - regionadds_args.argptr[0] = unsafe.Pointer(®ionadds_args.arg_dst) - regionadds_args.argptr[1] = unsafe.Pointer(®ionadds_args.arg_LUT) - regionadds_args.argptr[2] = unsafe.Pointer(®ionadds_args.arg_regions) - regionadds_args.argptr[3] = unsafe.Pointer(®ionadds_args.arg_N) -} + regionadds_args.argptr[0] = unsafe.Pointer(®ionadds_args.arg_dst) + regionadds_args.argptr[1] = unsafe.Pointer(®ionadds_args.arg_LUT) + regionadds_args.argptr[2] = unsafe.Pointer(®ionadds_args.arg_regions) + regionadds_args.argptr[3] = unsafe.Pointer(®ionadds_args.arg_N) + } // Wrapper for regionadds CUDA kernel, asynchronous. 
-func k_regionadds_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_regionadds_async ( dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("regionadds") } @@ -46,40 +46,41 @@ func k_regionadds_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.P regionadds_args.Lock() defer regionadds_args.Unlock() - if regionadds_code == 0 { + if regionadds_code == 0{ regionadds_code = fatbinLoad(regionadds_map, "regionadds") } - regionadds_args.arg_dst = dst - regionadds_args.arg_LUT = LUT - regionadds_args.arg_regions = regions - regionadds_args.arg_N = N + regionadds_args.arg_dst = dst + regionadds_args.arg_LUT = LUT + regionadds_args.arg_regions = regions + regionadds_args.arg_N = N + args := regionadds_args.argptr[:] cu.LaunchKernel(regionadds_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("regionadds") } } // maps compute capability on PTX code for regionadds kernel. -var regionadds_map = map[int]string{0: "", - 30: regionadds_ptx_30, - 35: regionadds_ptx_35, - 37: regionadds_ptx_37, - 50: regionadds_ptx_50, - 52: regionadds_ptx_52, - 53: regionadds_ptx_53, - 60: regionadds_ptx_60, - 61: regionadds_ptx_61, - 70: regionadds_ptx_70, - 75: regionadds_ptx_75} +var regionadds_map = map[int]string{ 0: "" , +30: regionadds_ptx_30 , +35: regionadds_ptx_35 , +37: regionadds_ptx_37 , +50: regionadds_ptx_50 , +52: regionadds_ptx_52 , +53: regionadds_ptx_53 , +60: regionadds_ptx_60 , +61: regionadds_ptx_61 , +70: regionadds_ptx_70 , +75: regionadds_ptx_75 } // regionadds PTX code for various compute capabilities. 
-const ( - regionadds_ptx_30 = ` +const( + regionadds_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -134,7 +135,7 @@ BB0_2: ` - regionadds_ptx_35 = ` + regionadds_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -192,7 +193,7 @@ BB0_2: ` - regionadds_ptx_37 = ` + regionadds_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -250,7 +251,7 @@ BB0_2: ` - regionadds_ptx_50 = ` + regionadds_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -308,7 +309,7 @@ BB0_2: ` - regionadds_ptx_52 = ` + regionadds_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -366,7 +367,7 @@ BB0_2: ` - regionadds_ptx_53 = ` + regionadds_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -424,7 +425,7 @@ BB0_2: ` - regionadds_ptx_60 = ` + regionadds_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -482,7 +483,7 @@ BB0_2: ` - regionadds_ptx_61 = ` + regionadds_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -540,7 +541,7 @@ BB0_2: ` - regionadds_ptx_70 = ` + regionadds_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -598,7 +599,7 @@ BB0_2: ` - regionadds_ptx_75 = ` + regionadds_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -656,4 +657,4 @@ BB0_2: ` -) + ) diff --git a/cuda/regionaddv_wrapper.go b/cuda/regionaddv_wrapper.go index d668056d3..81cbbe626 100644 --- a/cuda/regionaddv_wrapper.go +++ b/cuda/regionaddv_wrapper.go @@ -5,48 +5,48 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for regionaddv kernel var regionaddv_code cu.Function // Stores the arguments for regionaddv kernel invocation -type regionaddv_args_t struct { - arg_dstx unsafe.Pointer - arg_dsty unsafe.Pointer - arg_dstz unsafe.Pointer - arg_LUTx unsafe.Pointer - arg_LUTy unsafe.Pointer - arg_LUTz unsafe.Pointer - arg_regions unsafe.Pointer - arg_N int - argptr [8]unsafe.Pointer +type regionaddv_args_t struct{ + arg_dstx unsafe.Pointer + arg_dsty unsafe.Pointer + arg_dstz unsafe.Pointer + arg_LUTx unsafe.Pointer + arg_LUTy unsafe.Pointer + arg_LUTz unsafe.Pointer + arg_regions unsafe.Pointer + arg_N int + argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for regionaddv kernel invocation var regionaddv_args regionaddv_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - regionaddv_args.argptr[0] = unsafe.Pointer(®ionaddv_args.arg_dstx) - regionaddv_args.argptr[1] = unsafe.Pointer(®ionaddv_args.arg_dsty) - regionaddv_args.argptr[2] = unsafe.Pointer(®ionaddv_args.arg_dstz) - regionaddv_args.argptr[3] = unsafe.Pointer(®ionaddv_args.arg_LUTx) - regionaddv_args.argptr[4] = unsafe.Pointer(®ionaddv_args.arg_LUTy) - regionaddv_args.argptr[5] = unsafe.Pointer(®ionaddv_args.arg_LUTz) - regionaddv_args.argptr[6] = unsafe.Pointer(®ionaddv_args.arg_regions) - regionaddv_args.argptr[7] = unsafe.Pointer(®ionaddv_args.arg_N) -} + regionaddv_args.argptr[0] = unsafe.Pointer(®ionaddv_args.arg_dstx) + regionaddv_args.argptr[1] = unsafe.Pointer(®ionaddv_args.arg_dsty) + regionaddv_args.argptr[2] = unsafe.Pointer(®ionaddv_args.arg_dstz) + regionaddv_args.argptr[3] = unsafe.Pointer(®ionaddv_args.arg_LUTx) + regionaddv_args.argptr[4] = unsafe.Pointer(®ionaddv_args.arg_LUTy) + regionaddv_args.argptr[5] = unsafe.Pointer(®ionaddv_args.arg_LUTz) + regionaddv_args.argptr[6] = unsafe.Pointer(®ionaddv_args.arg_regions) 
+ regionaddv_args.argptr[7] = unsafe.Pointer(®ionaddv_args.arg_N) + } // Wrapper for regionaddv CUDA kernel, asynchronous. -func k_regionaddv_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, LUTx unsafe.Pointer, LUTy unsafe.Pointer, LUTz unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_regionaddv_async ( dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, LUTx unsafe.Pointer, LUTy unsafe.Pointer, LUTz unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("regionaddv") } @@ -54,44 +54,45 @@ func k_regionaddv_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Po regionaddv_args.Lock() defer regionaddv_args.Unlock() - if regionaddv_code == 0 { + if regionaddv_code == 0{ regionaddv_code = fatbinLoad(regionaddv_map, "regionaddv") } - regionaddv_args.arg_dstx = dstx - regionaddv_args.arg_dsty = dsty - regionaddv_args.arg_dstz = dstz - regionaddv_args.arg_LUTx = LUTx - regionaddv_args.arg_LUTy = LUTy - regionaddv_args.arg_LUTz = LUTz - regionaddv_args.arg_regions = regions - regionaddv_args.arg_N = N + regionaddv_args.arg_dstx = dstx + regionaddv_args.arg_dsty = dsty + regionaddv_args.arg_dstz = dstz + regionaddv_args.arg_LUTx = LUTx + regionaddv_args.arg_LUTy = LUTy + regionaddv_args.arg_LUTz = LUTz + regionaddv_args.arg_regions = regions + regionaddv_args.arg_N = N + args := regionaddv_args.argptr[:] cu.LaunchKernel(regionaddv_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("regionaddv") } } // maps compute capability on PTX code for regionaddv kernel. 
-var regionaddv_map = map[int]string{0: "", - 30: regionaddv_ptx_30, - 35: regionaddv_ptx_35, - 37: regionaddv_ptx_37, - 50: regionaddv_ptx_50, - 52: regionaddv_ptx_52, - 53: regionaddv_ptx_53, - 60: regionaddv_ptx_60, - 61: regionaddv_ptx_61, - 70: regionaddv_ptx_70, - 75: regionaddv_ptx_75} +var regionaddv_map = map[int]string{ 0: "" , +30: regionaddv_ptx_30 , +35: regionaddv_ptx_35 , +37: regionaddv_ptx_37 , +50: regionaddv_ptx_50 , +52: regionaddv_ptx_52 , +53: regionaddv_ptx_53 , +60: regionaddv_ptx_60 , +61: regionaddv_ptx_61 , +70: regionaddv_ptx_70 , +75: regionaddv_ptx_75 } // regionaddv PTX code for various compute capabilities. -const ( - regionaddv_ptx_30 = ` +const( + regionaddv_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -170,7 +171,7 @@ BB0_2: ` - regionaddv_ptx_35 = ` + regionaddv_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -252,7 +253,7 @@ BB0_2: ` - regionaddv_ptx_37 = ` + regionaddv_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -334,7 +335,7 @@ BB0_2: ` - regionaddv_ptx_50 = ` + regionaddv_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -416,7 +417,7 @@ BB0_2: ` - regionaddv_ptx_52 = ` + regionaddv_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -498,7 +499,7 @@ BB0_2: ` - regionaddv_ptx_53 = ` + regionaddv_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -580,7 +581,7 @@ BB0_2: ` - regionaddv_ptx_60 = ` + regionaddv_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -662,7 +663,7 @@ BB0_2: ` - regionaddv_ptx_61 = ` + regionaddv_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -744,7 +745,7 @@ BB0_2: ` - regionaddv_ptx_70 = ` + regionaddv_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -826,7 +827,7 @@ BB0_2: ` - regionaddv_ptx_75 = ` + regionaddv_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -908,4 +909,4 @@ BB0_2: ` -) + ) diff --git a/cuda/regiondecode_wrapper.go b/cuda/regiondecode_wrapper.go index f79f41dcc..9f4c44f2e 100644 --- 
a/cuda/regiondecode_wrapper.go +++ b/cuda/regiondecode_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for regiondecode kernel var regiondecode_code cu.Function // Stores the arguments for regiondecode kernel invocation -type regiondecode_args_t struct { - arg_dst unsafe.Pointer - arg_LUT unsafe.Pointer - arg_regions unsafe.Pointer - arg_N int - argptr [4]unsafe.Pointer +type regiondecode_args_t struct{ + arg_dst unsafe.Pointer + arg_LUT unsafe.Pointer + arg_regions unsafe.Pointer + arg_N int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for regiondecode kernel invocation var regiondecode_args regiondecode_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - regiondecode_args.argptr[0] = unsafe.Pointer(®iondecode_args.arg_dst) - regiondecode_args.argptr[1] = unsafe.Pointer(®iondecode_args.arg_LUT) - regiondecode_args.argptr[2] = unsafe.Pointer(®iondecode_args.arg_regions) - regiondecode_args.argptr[3] = unsafe.Pointer(®iondecode_args.arg_N) -} + regiondecode_args.argptr[0] = unsafe.Pointer(®iondecode_args.arg_dst) + regiondecode_args.argptr[1] = unsafe.Pointer(®iondecode_args.arg_LUT) + regiondecode_args.argptr[2] = unsafe.Pointer(®iondecode_args.arg_regions) + regiondecode_args.argptr[3] = unsafe.Pointer(®iondecode_args.arg_N) + } // Wrapper for regiondecode CUDA kernel, asynchronous. 
-func k_regiondecode_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_regiondecode_async ( dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("regiondecode") } @@ -46,40 +46,41 @@ func k_regiondecode_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe regiondecode_args.Lock() defer regiondecode_args.Unlock() - if regiondecode_code == 0 { + if regiondecode_code == 0{ regiondecode_code = fatbinLoad(regiondecode_map, "regiondecode") } - regiondecode_args.arg_dst = dst - regiondecode_args.arg_LUT = LUT - regiondecode_args.arg_regions = regions - regiondecode_args.arg_N = N + regiondecode_args.arg_dst = dst + regiondecode_args.arg_LUT = LUT + regiondecode_args.arg_regions = regions + regiondecode_args.arg_N = N + args := regiondecode_args.argptr[:] cu.LaunchKernel(regiondecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("regiondecode") } } // maps compute capability on PTX code for regiondecode kernel. -var regiondecode_map = map[int]string{0: "", - 30: regiondecode_ptx_30, - 35: regiondecode_ptx_35, - 37: regiondecode_ptx_37, - 50: regiondecode_ptx_50, - 52: regiondecode_ptx_52, - 53: regiondecode_ptx_53, - 60: regiondecode_ptx_60, - 61: regiondecode_ptx_61, - 70: regiondecode_ptx_70, - 75: regiondecode_ptx_75} +var regiondecode_map = map[int]string{ 0: "" , +30: regiondecode_ptx_30 , +35: regiondecode_ptx_35 , +37: regiondecode_ptx_37 , +50: regiondecode_ptx_50 , +52: regiondecode_ptx_52 , +53: regiondecode_ptx_53 , +60: regiondecode_ptx_60 , +61: regiondecode_ptx_61 , +70: regiondecode_ptx_70 , +75: regiondecode_ptx_75 } // regiondecode PTX code for various compute capabilities. 
-const ( - regiondecode_ptx_30 = ` +const( + regiondecode_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -132,7 +133,7 @@ BB0_2: ` - regiondecode_ptx_35 = ` + regiondecode_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -188,7 +189,7 @@ BB0_2: ` - regiondecode_ptx_37 = ` + regiondecode_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -244,7 +245,7 @@ BB0_2: ` - regiondecode_ptx_50 = ` + regiondecode_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -300,7 +301,7 @@ BB0_2: ` - regiondecode_ptx_52 = ` + regiondecode_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -356,7 +357,7 @@ BB0_2: ` - regiondecode_ptx_53 = ` + regiondecode_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -412,7 +413,7 @@ BB0_2: ` - regiondecode_ptx_60 = ` + regiondecode_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -468,7 +469,7 @@ BB0_2: ` - regiondecode_ptx_61 = ` + regiondecode_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -524,7 +525,7 @@ BB0_2: ` - regiondecode_ptx_70 = ` + regiondecode_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -580,7 +581,7 @@ BB0_2: ` - regiondecode_ptx_75 = ` + regiondecode_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -636,4 +637,4 @@ BB0_2: ` -) + ) diff --git a/cuda/regionselect_wrapper.go b/cuda/regionselect_wrapper.go index eaa282953..6c5cd8a6d 100644 --- a/cuda/regionselect_wrapper.go +++ b/cuda/regionselect_wrapper.go @@ -5,42 +5,42 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for regionselect kernel var regionselect_code cu.Function // Stores the arguments for regionselect kernel invocation -type regionselect_args_t struct { - arg_dst unsafe.Pointer - arg_src unsafe.Pointer - arg_regions unsafe.Pointer - arg_region byte - arg_N int - argptr [5]unsafe.Pointer +type regionselect_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_regions unsafe.Pointer + arg_region byte + arg_N int + argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for regionselect kernel invocation var regionselect_args regionselect_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - regionselect_args.argptr[0] = unsafe.Pointer(®ionselect_args.arg_dst) - regionselect_args.argptr[1] = unsafe.Pointer(®ionselect_args.arg_src) - regionselect_args.argptr[2] = unsafe.Pointer(®ionselect_args.arg_regions) - regionselect_args.argptr[3] = unsafe.Pointer(®ionselect_args.arg_region) - regionselect_args.argptr[4] = unsafe.Pointer(®ionselect_args.arg_N) -} + regionselect_args.argptr[0] = unsafe.Pointer(®ionselect_args.arg_dst) + regionselect_args.argptr[1] = unsafe.Pointer(®ionselect_args.arg_src) + regionselect_args.argptr[2] = unsafe.Pointer(®ionselect_args.arg_regions) + regionselect_args.argptr[3] = unsafe.Pointer(®ionselect_args.arg_region) + regionselect_args.argptr[4] = unsafe.Pointer(®ionselect_args.arg_N) + } // Wrapper for regionselect CUDA kernel, asynchronous. 
-func k_regionselect_async(dst unsafe.Pointer, src unsafe.Pointer, regions unsafe.Pointer, region byte, N int, cfg *config) { - if Synchronous { // debug +func k_regionselect_async ( dst unsafe.Pointer, src unsafe.Pointer, regions unsafe.Pointer, region byte, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("regionselect") } @@ -48,41 +48,42 @@ func k_regionselect_async(dst unsafe.Pointer, src unsafe.Pointer, regions unsafe regionselect_args.Lock() defer regionselect_args.Unlock() - if regionselect_code == 0 { + if regionselect_code == 0{ regionselect_code = fatbinLoad(regionselect_map, "regionselect") } - regionselect_args.arg_dst = dst - regionselect_args.arg_src = src - regionselect_args.arg_regions = regions - regionselect_args.arg_region = region - regionselect_args.arg_N = N + regionselect_args.arg_dst = dst + regionselect_args.arg_src = src + regionselect_args.arg_regions = regions + regionselect_args.arg_region = region + regionselect_args.arg_N = N + args := regionselect_args.argptr[:] cu.LaunchKernel(regionselect_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("regionselect") } } // maps compute capability on PTX code for regionselect kernel. 
-var regionselect_map = map[int]string{0: "", - 30: regionselect_ptx_30, - 35: regionselect_ptx_35, - 37: regionselect_ptx_37, - 50: regionselect_ptx_50, - 52: regionselect_ptx_52, - 53: regionselect_ptx_53, - 60: regionselect_ptx_60, - 61: regionselect_ptx_61, - 70: regionselect_ptx_70, - 75: regionselect_ptx_75} +var regionselect_map = map[int]string{ 0: "" , +30: regionselect_ptx_30 , +35: regionselect_ptx_35 , +37: regionselect_ptx_37 , +50: regionselect_ptx_50 , +52: regionselect_ptx_52 , +53: regionselect_ptx_53 , +60: regionselect_ptx_60 , +61: regionselect_ptx_61 , +70: regionselect_ptx_70 , +75: regionselect_ptx_75 } // regionselect PTX code for various compute capabilities. -const ( - regionselect_ptx_30 = ` +const( + regionselect_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -144,7 +145,7 @@ BB0_4: ` - regionselect_ptx_35 = ` + regionselect_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -206,7 +207,7 @@ BB0_4: ` - regionselect_ptx_37 = ` + regionselect_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -268,7 +269,7 @@ BB0_4: ` - regionselect_ptx_50 = ` + regionselect_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -330,7 +331,7 @@ BB0_4: ` - regionselect_ptx_52 = ` + regionselect_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -392,7 +393,7 @@ BB0_4: ` - regionselect_ptx_53 = ` + regionselect_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -454,7 +455,7 @@ BB0_4: ` - regionselect_ptx_60 = ` + regionselect_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -516,7 +517,7 @@ BB0_4: ` - regionselect_ptx_61 = ` + regionselect_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -578,7 +579,7 @@ BB0_4: ` - regionselect_ptx_70 = ` + regionselect_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -640,7 +641,7 @@ BB0_4: ` - regionselect_ptx_75 = ` + regionselect_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -702,4 +703,4 @@ BB0_4: ` -) + ) diff --git 
a/cuda/resize_wrapper.go b/cuda/resize_wrapper.go index 11140a5a3..6ffd7388f 100644 --- a/cuda/resize_wrapper.go +++ b/cuda/resize_wrapper.go @@ -5,54 +5,54 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for resize kernel var resize_code cu.Function // Stores the arguments for resize kernel invocation -type resize_args_t struct { - arg_dst unsafe.Pointer - arg_Dx int - arg_Dy int - arg_Dz int - arg_src unsafe.Pointer - arg_Sx int - arg_Sy int - arg_Sz int - arg_layer int - arg_scalex int - arg_scaley int - argptr [11]unsafe.Pointer +type resize_args_t struct{ + arg_dst unsafe.Pointer + arg_Dx int + arg_Dy int + arg_Dz int + arg_src unsafe.Pointer + arg_Sx int + arg_Sy int + arg_Sz int + arg_layer int + arg_scalex int + arg_scaley int + argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for resize kernel invocation var resize_args resize_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- resize_args.argptr[0] = unsafe.Pointer(&resize_args.arg_dst) - resize_args.argptr[1] = unsafe.Pointer(&resize_args.arg_Dx) - resize_args.argptr[2] = unsafe.Pointer(&resize_args.arg_Dy) - resize_args.argptr[3] = unsafe.Pointer(&resize_args.arg_Dz) - resize_args.argptr[4] = unsafe.Pointer(&resize_args.arg_src) - resize_args.argptr[5] = unsafe.Pointer(&resize_args.arg_Sx) - resize_args.argptr[6] = unsafe.Pointer(&resize_args.arg_Sy) - resize_args.argptr[7] = unsafe.Pointer(&resize_args.arg_Sz) - resize_args.argptr[8] = unsafe.Pointer(&resize_args.arg_layer) - resize_args.argptr[9] = unsafe.Pointer(&resize_args.arg_scalex) - resize_args.argptr[10] = unsafe.Pointer(&resize_args.arg_scaley) -} + resize_args.argptr[0] = unsafe.Pointer(&resize_args.arg_dst) + resize_args.argptr[1] = unsafe.Pointer(&resize_args.arg_Dx) + resize_args.argptr[2] = unsafe.Pointer(&resize_args.arg_Dy) + resize_args.argptr[3] = unsafe.Pointer(&resize_args.arg_Dz) + resize_args.argptr[4] = unsafe.Pointer(&resize_args.arg_src) + resize_args.argptr[5] = unsafe.Pointer(&resize_args.arg_Sx) + resize_args.argptr[6] = unsafe.Pointer(&resize_args.arg_Sy) + resize_args.argptr[7] = unsafe.Pointer(&resize_args.arg_Sz) + resize_args.argptr[8] = unsafe.Pointer(&resize_args.arg_layer) + resize_args.argptr[9] = unsafe.Pointer(&resize_args.arg_scalex) + resize_args.argptr[10] = unsafe.Pointer(&resize_args.arg_scaley) + } // Wrapper for resize CUDA kernel, asynchronous. 
-func k_resize_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, layer int, scalex int, scaley int, cfg *config) { - if Synchronous { // debug +func k_resize_async ( dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, layer int, scalex int, scaley int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("resize") } @@ -60,47 +60,48 @@ func k_resize_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Point resize_args.Lock() defer resize_args.Unlock() - if resize_code == 0 { + if resize_code == 0{ resize_code = fatbinLoad(resize_map, "resize") } - resize_args.arg_dst = dst - resize_args.arg_Dx = Dx - resize_args.arg_Dy = Dy - resize_args.arg_Dz = Dz - resize_args.arg_src = src - resize_args.arg_Sx = Sx - resize_args.arg_Sy = Sy - resize_args.arg_Sz = Sz - resize_args.arg_layer = layer - resize_args.arg_scalex = scalex - resize_args.arg_scaley = scaley + resize_args.arg_dst = dst + resize_args.arg_Dx = Dx + resize_args.arg_Dy = Dy + resize_args.arg_Dz = Dz + resize_args.arg_src = src + resize_args.arg_Sx = Sx + resize_args.arg_Sy = Sy + resize_args.arg_Sz = Sz + resize_args.arg_layer = layer + resize_args.arg_scalex = scalex + resize_args.arg_scaley = scaley + args := resize_args.argptr[:] cu.LaunchKernel(resize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("resize") } } // maps compute capability on PTX code for resize kernel. 
-var resize_map = map[int]string{0: "", - 30: resize_ptx_30, - 35: resize_ptx_35, - 37: resize_ptx_37, - 50: resize_ptx_50, - 52: resize_ptx_52, - 53: resize_ptx_53, - 60: resize_ptx_60, - 61: resize_ptx_61, - 70: resize_ptx_70, - 75: resize_ptx_75} +var resize_map = map[int]string{ 0: "" , +30: resize_ptx_30 , +35: resize_ptx_35 , +37: resize_ptx_37 , +50: resize_ptx_50 , +52: resize_ptx_52 , +53: resize_ptx_53 , +60: resize_ptx_60 , +61: resize_ptx_61 , +70: resize_ptx_70 , +75: resize_ptx_75 } // resize PTX code for various compute capabilities. -const ( - resize_ptx_30 = ` +const( + resize_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -331,7 +332,7 @@ BB0_29: ` - resize_ptx_35 = ` + resize_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -561,7 +562,7 @@ BB0_29: ` - resize_ptx_37 = ` + resize_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -791,7 +792,7 @@ BB0_29: ` - resize_ptx_50 = ` + resize_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -1021,7 +1022,7 @@ BB0_29: ` - resize_ptx_52 = ` + resize_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -1251,7 +1252,7 @@ BB0_29: ` - resize_ptx_53 = ` + resize_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -1481,7 +1482,7 @@ BB0_29: ` - resize_ptx_60 = ` + resize_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -1711,7 +1712,7 @@ BB0_29: ` - resize_ptx_61 = ` + resize_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1941,7 +1942,7 @@ BB0_29: ` - resize_ptx_70 = ` + resize_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -2171,7 +2172,7 @@ BB0_29: ` - resize_ptx_75 = ` + resize_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -2401,4 +2402,4 @@ BB0_29: ` -) + ) diff --git a/cuda/shiftbytes_wrapper.go b/cuda/shiftbytes_wrapper.go index 377ccfcfb..d023adef9 100644 --- a/cuda/shiftbytes_wrapper.go +++ b/cuda/shiftbytes_wrapper.go @@ -5,46 +5,46 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for shiftbytes kernel var shiftbytes_code cu.Function // Stores the arguments for shiftbytes kernel invocation -type shiftbytes_args_t struct { - arg_dst unsafe.Pointer - arg_src unsafe.Pointer - arg_Nx int - arg_Ny int - arg_Nz int - arg_shx int - arg_clamp byte - argptr [7]unsafe.Pointer +type shiftbytes_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shx int + arg_clamp byte + argptr [7]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftbytes kernel invocation var shiftbytes_args shiftbytes_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - shiftbytes_args.argptr[0] = unsafe.Pointer(&shiftbytes_args.arg_dst) - shiftbytes_args.argptr[1] = unsafe.Pointer(&shiftbytes_args.arg_src) - shiftbytes_args.argptr[2] = unsafe.Pointer(&shiftbytes_args.arg_Nx) - shiftbytes_args.argptr[3] = unsafe.Pointer(&shiftbytes_args.arg_Ny) - shiftbytes_args.argptr[4] = unsafe.Pointer(&shiftbytes_args.arg_Nz) - shiftbytes_args.argptr[5] = unsafe.Pointer(&shiftbytes_args.arg_shx) - shiftbytes_args.argptr[6] = unsafe.Pointer(&shiftbytes_args.arg_clamp) -} + shiftbytes_args.argptr[0] = unsafe.Pointer(&shiftbytes_args.arg_dst) + shiftbytes_args.argptr[1] = unsafe.Pointer(&shiftbytes_args.arg_src) + shiftbytes_args.argptr[2] = unsafe.Pointer(&shiftbytes_args.arg_Nx) + shiftbytes_args.argptr[3] = unsafe.Pointer(&shiftbytes_args.arg_Ny) + shiftbytes_args.argptr[4] = unsafe.Pointer(&shiftbytes_args.arg_Nz) + shiftbytes_args.argptr[5] = unsafe.Pointer(&shiftbytes_args.arg_shx) + shiftbytes_args.argptr[6] = unsafe.Pointer(&shiftbytes_args.arg_clamp) + } // Wrapper for shiftbytes CUDA kernel, asynchronous. 
-func k_shiftbytes_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clamp byte, cfg *config) { - if Synchronous { // debug +func k_shiftbytes_async ( dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clamp byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("shiftbytes") } @@ -52,43 +52,44 @@ func k_shiftbytes_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, shiftbytes_args.Lock() defer shiftbytes_args.Unlock() - if shiftbytes_code == 0 { + if shiftbytes_code == 0{ shiftbytes_code = fatbinLoad(shiftbytes_map, "shiftbytes") } - shiftbytes_args.arg_dst = dst - shiftbytes_args.arg_src = src - shiftbytes_args.arg_Nx = Nx - shiftbytes_args.arg_Ny = Ny - shiftbytes_args.arg_Nz = Nz - shiftbytes_args.arg_shx = shx - shiftbytes_args.arg_clamp = clamp + shiftbytes_args.arg_dst = dst + shiftbytes_args.arg_src = src + shiftbytes_args.arg_Nx = Nx + shiftbytes_args.arg_Ny = Ny + shiftbytes_args.arg_Nz = Nz + shiftbytes_args.arg_shx = shx + shiftbytes_args.arg_clamp = clamp + args := shiftbytes_args.argptr[:] cu.LaunchKernel(shiftbytes_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("shiftbytes") } } // maps compute capability on PTX code for shiftbytes kernel. 
-var shiftbytes_map = map[int]string{0: "", - 30: shiftbytes_ptx_30, - 35: shiftbytes_ptx_35, - 37: shiftbytes_ptx_37, - 50: shiftbytes_ptx_50, - 52: shiftbytes_ptx_52, - 53: shiftbytes_ptx_53, - 60: shiftbytes_ptx_60, - 61: shiftbytes_ptx_61, - 70: shiftbytes_ptx_70, - 75: shiftbytes_ptx_75} +var shiftbytes_map = map[int]string{ 0: "" , +30: shiftbytes_ptx_30 , +35: shiftbytes_ptx_35 , +37: shiftbytes_ptx_37 , +50: shiftbytes_ptx_50 , +52: shiftbytes_ptx_52 , +53: shiftbytes_ptx_53 , +60: shiftbytes_ptx_60 , +61: shiftbytes_ptx_61 , +70: shiftbytes_ptx_70 , +75: shiftbytes_ptx_75 } // shiftbytes PTX code for various compute capabilities. -const ( - shiftbytes_ptx_30 = ` +const( + shiftbytes_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -166,7 +167,7 @@ BB0_4: ` - shiftbytes_ptx_35 = ` + shiftbytes_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -244,7 +245,7 @@ BB0_4: ` - shiftbytes_ptx_37 = ` + shiftbytes_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -322,7 +323,7 @@ BB0_4: ` - shiftbytes_ptx_50 = ` + shiftbytes_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -400,7 +401,7 @@ BB0_4: ` - shiftbytes_ptx_52 = ` + shiftbytes_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -478,7 +479,7 @@ BB0_4: ` - shiftbytes_ptx_53 = ` + shiftbytes_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -556,7 +557,7 @@ BB0_4: ` - shiftbytes_ptx_60 = ` + shiftbytes_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -634,7 +635,7 @@ BB0_4: ` - shiftbytes_ptx_61 = ` + shiftbytes_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -712,7 +713,7 @@ BB0_4: ` - shiftbytes_ptx_70 = ` + shiftbytes_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -790,7 +791,7 @@ BB0_4: ` - shiftbytes_ptx_75 = ` + shiftbytes_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -868,4 +869,4 @@ BB0_4: ` -) + ) diff --git a/cuda/shiftbytesy_wrapper.go b/cuda/shiftbytesy_wrapper.go index 358c0268a..20f839418 100644 --- 
a/cuda/shiftbytesy_wrapper.go +++ b/cuda/shiftbytesy_wrapper.go @@ -5,46 +5,46 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for shiftbytesy kernel var shiftbytesy_code cu.Function // Stores the arguments for shiftbytesy kernel invocation -type shiftbytesy_args_t struct { - arg_dst unsafe.Pointer - arg_src unsafe.Pointer - arg_Nx int - arg_Ny int - arg_Nz int - arg_shy int - arg_clamp byte - argptr [7]unsafe.Pointer +type shiftbytesy_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shy int + arg_clamp byte + argptr [7]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftbytesy kernel invocation var shiftbytesy_args shiftbytesy_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - shiftbytesy_args.argptr[0] = unsafe.Pointer(&shiftbytesy_args.arg_dst) - shiftbytesy_args.argptr[1] = unsafe.Pointer(&shiftbytesy_args.arg_src) - shiftbytesy_args.argptr[2] = unsafe.Pointer(&shiftbytesy_args.arg_Nx) - shiftbytesy_args.argptr[3] = unsafe.Pointer(&shiftbytesy_args.arg_Ny) - shiftbytesy_args.argptr[4] = unsafe.Pointer(&shiftbytesy_args.arg_Nz) - shiftbytesy_args.argptr[5] = unsafe.Pointer(&shiftbytesy_args.arg_shy) - shiftbytesy_args.argptr[6] = unsafe.Pointer(&shiftbytesy_args.arg_clamp) -} + shiftbytesy_args.argptr[0] = unsafe.Pointer(&shiftbytesy_args.arg_dst) + shiftbytesy_args.argptr[1] = unsafe.Pointer(&shiftbytesy_args.arg_src) + shiftbytesy_args.argptr[2] = unsafe.Pointer(&shiftbytesy_args.arg_Nx) + shiftbytesy_args.argptr[3] = unsafe.Pointer(&shiftbytesy_args.arg_Ny) + shiftbytesy_args.argptr[4] = unsafe.Pointer(&shiftbytesy_args.arg_Nz) + shiftbytesy_args.argptr[5] = unsafe.Pointer(&shiftbytesy_args.arg_shy) + shiftbytesy_args.argptr[6] = unsafe.Pointer(&shiftbytesy_args.arg_clamp) + } // Wrapper for shiftbytesy CUDA kernel, 
asynchronous. -func k_shiftbytesy_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clamp byte, cfg *config) { - if Synchronous { // debug +func k_shiftbytesy_async ( dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clamp byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("shiftbytesy") } @@ -52,43 +52,44 @@ func k_shiftbytesy_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, shiftbytesy_args.Lock() defer shiftbytesy_args.Unlock() - if shiftbytesy_code == 0 { + if shiftbytesy_code == 0{ shiftbytesy_code = fatbinLoad(shiftbytesy_map, "shiftbytesy") } - shiftbytesy_args.arg_dst = dst - shiftbytesy_args.arg_src = src - shiftbytesy_args.arg_Nx = Nx - shiftbytesy_args.arg_Ny = Ny - shiftbytesy_args.arg_Nz = Nz - shiftbytesy_args.arg_shy = shy - shiftbytesy_args.arg_clamp = clamp + shiftbytesy_args.arg_dst = dst + shiftbytesy_args.arg_src = src + shiftbytesy_args.arg_Nx = Nx + shiftbytesy_args.arg_Ny = Ny + shiftbytesy_args.arg_Nz = Nz + shiftbytesy_args.arg_shy = shy + shiftbytesy_args.arg_clamp = clamp + args := shiftbytesy_args.argptr[:] cu.LaunchKernel(shiftbytesy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("shiftbytesy") } } // maps compute capability on PTX code for shiftbytesy kernel. 
-var shiftbytesy_map = map[int]string{0: "", - 30: shiftbytesy_ptx_30, - 35: shiftbytesy_ptx_35, - 37: shiftbytesy_ptx_37, - 50: shiftbytesy_ptx_50, - 52: shiftbytesy_ptx_52, - 53: shiftbytesy_ptx_53, - 60: shiftbytesy_ptx_60, - 61: shiftbytesy_ptx_61, - 70: shiftbytesy_ptx_70, - 75: shiftbytesy_ptx_75} +var shiftbytesy_map = map[int]string{ 0: "" , +30: shiftbytesy_ptx_30 , +35: shiftbytesy_ptx_35 , +37: shiftbytesy_ptx_37 , +50: shiftbytesy_ptx_50 , +52: shiftbytesy_ptx_52 , +53: shiftbytesy_ptx_53 , +60: shiftbytesy_ptx_60 , +61: shiftbytesy_ptx_61 , +70: shiftbytesy_ptx_70 , +75: shiftbytesy_ptx_75 } // shiftbytesy PTX code for various compute capabilities. -const ( - shiftbytesy_ptx_30 = ` +const( + shiftbytesy_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -167,7 +168,7 @@ BB0_4: ` - shiftbytesy_ptx_35 = ` + shiftbytesy_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -246,7 +247,7 @@ BB0_4: ` - shiftbytesy_ptx_37 = ` + shiftbytesy_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -325,7 +326,7 @@ BB0_4: ` - shiftbytesy_ptx_50 = ` + shiftbytesy_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -404,7 +405,7 @@ BB0_4: ` - shiftbytesy_ptx_52 = ` + shiftbytesy_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -483,7 +484,7 @@ BB0_4: ` - shiftbytesy_ptx_53 = ` + shiftbytesy_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -562,7 +563,7 @@ BB0_4: ` - shiftbytesy_ptx_60 = ` + shiftbytesy_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -641,7 +642,7 @@ BB0_4: ` - shiftbytesy_ptx_61 = ` + shiftbytesy_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -720,7 +721,7 @@ BB0_4: ` - shiftbytesy_ptx_70 = ` + shiftbytesy_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -799,7 +800,7 @@ BB0_4: ` - shiftbytesy_ptx_75 = ` + shiftbytesy_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -878,4 +879,4 @@ BB0_4: ` -) + ) diff --git a/cuda/shiftx_wrapper.go b/cuda/shiftx_wrapper.go index 
81fefc439..f7b15bbda 100644 --- a/cuda/shiftx_wrapper.go +++ b/cuda/shiftx_wrapper.go @@ -5,48 +5,48 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for shiftx kernel var shiftx_code cu.Function // Stores the arguments for shiftx kernel invocation -type shiftx_args_t struct { - arg_dst unsafe.Pointer - arg_src unsafe.Pointer - arg_Nx int - arg_Ny int - arg_Nz int - arg_shx int - arg_clampL float32 - arg_clampR float32 - argptr [8]unsafe.Pointer +type shiftx_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shx int + arg_clampL float32 + arg_clampR float32 + argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftx kernel invocation var shiftx_args shiftx_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - shiftx_args.argptr[0] = unsafe.Pointer(&shiftx_args.arg_dst) - shiftx_args.argptr[1] = unsafe.Pointer(&shiftx_args.arg_src) - shiftx_args.argptr[2] = unsafe.Pointer(&shiftx_args.arg_Nx) - shiftx_args.argptr[3] = unsafe.Pointer(&shiftx_args.arg_Ny) - shiftx_args.argptr[4] = unsafe.Pointer(&shiftx_args.arg_Nz) - shiftx_args.argptr[5] = unsafe.Pointer(&shiftx_args.arg_shx) - shiftx_args.argptr[6] = unsafe.Pointer(&shiftx_args.arg_clampL) - shiftx_args.argptr[7] = unsafe.Pointer(&shiftx_args.arg_clampR) -} + shiftx_args.argptr[0] = unsafe.Pointer(&shiftx_args.arg_dst) + shiftx_args.argptr[1] = unsafe.Pointer(&shiftx_args.arg_src) + shiftx_args.argptr[2] = unsafe.Pointer(&shiftx_args.arg_Nx) + shiftx_args.argptr[3] = unsafe.Pointer(&shiftx_args.arg_Ny) + shiftx_args.argptr[4] = unsafe.Pointer(&shiftx_args.arg_Nz) + shiftx_args.argptr[5] = unsafe.Pointer(&shiftx_args.arg_shx) + shiftx_args.argptr[6] = unsafe.Pointer(&shiftx_args.arg_clampL) + shiftx_args.argptr[7] = unsafe.Pointer(&shiftx_args.arg_clampR) + } // Wrapper for 
shiftx CUDA kernel, asynchronous. -func k_shiftx_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { - if Synchronous { // debug +func k_shiftx_async ( dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("shiftx") } @@ -54,44 +54,45 @@ func k_shiftx_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz i shiftx_args.Lock() defer shiftx_args.Unlock() - if shiftx_code == 0 { + if shiftx_code == 0{ shiftx_code = fatbinLoad(shiftx_map, "shiftx") } - shiftx_args.arg_dst = dst - shiftx_args.arg_src = src - shiftx_args.arg_Nx = Nx - shiftx_args.arg_Ny = Ny - shiftx_args.arg_Nz = Nz - shiftx_args.arg_shx = shx - shiftx_args.arg_clampL = clampL - shiftx_args.arg_clampR = clampR + shiftx_args.arg_dst = dst + shiftx_args.arg_src = src + shiftx_args.arg_Nx = Nx + shiftx_args.arg_Ny = Ny + shiftx_args.arg_Nz = Nz + shiftx_args.arg_shx = shx + shiftx_args.arg_clampL = clampL + shiftx_args.arg_clampR = clampR + args := shiftx_args.argptr[:] cu.LaunchKernel(shiftx_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("shiftx") } } // maps compute capability on PTX code for shiftx kernel. -var shiftx_map = map[int]string{0: "", - 30: shiftx_ptx_30, - 35: shiftx_ptx_35, - 37: shiftx_ptx_37, - 50: shiftx_ptx_50, - 52: shiftx_ptx_52, - 53: shiftx_ptx_53, - 60: shiftx_ptx_60, - 61: shiftx_ptx_61, - 70: shiftx_ptx_70, - 75: shiftx_ptx_75} +var shiftx_map = map[int]string{ 0: "" , +30: shiftx_ptx_30 , +35: shiftx_ptx_35 , +37: shiftx_ptx_37 , +50: shiftx_ptx_50 , +52: shiftx_ptx_52 , +53: shiftx_ptx_53 , +60: shiftx_ptx_60 , +61: shiftx_ptx_61 , +70: shiftx_ptx_70 , +75: shiftx_ptx_75 } // shiftx PTX code for various compute capabilities. 
-const ( - shiftx_ptx_30 = ` +const( + shiftx_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -173,7 +174,7 @@ BB0_5: ` - shiftx_ptx_35 = ` + shiftx_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -255,7 +256,7 @@ BB0_5: ` - shiftx_ptx_37 = ` + shiftx_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -337,7 +338,7 @@ BB0_5: ` - shiftx_ptx_50 = ` + shiftx_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -419,7 +420,7 @@ BB0_5: ` - shiftx_ptx_52 = ` + shiftx_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -501,7 +502,7 @@ BB0_5: ` - shiftx_ptx_53 = ` + shiftx_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -583,7 +584,7 @@ BB0_5: ` - shiftx_ptx_60 = ` + shiftx_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -665,7 +666,7 @@ BB0_5: ` - shiftx_ptx_61 = ` + shiftx_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -747,7 +748,7 @@ BB0_5: ` - shiftx_ptx_70 = ` + shiftx_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -829,7 +830,7 @@ BB0_5: ` - shiftx_ptx_75 = ` + shiftx_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -911,4 +912,4 @@ BB0_5: ` -) + ) diff --git a/cuda/shifty_wrapper.go b/cuda/shifty_wrapper.go index 69454058a..717d8d8ab 100644 --- a/cuda/shifty_wrapper.go +++ b/cuda/shifty_wrapper.go @@ -5,48 +5,48 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for shifty kernel var shifty_code cu.Function // Stores the arguments for shifty kernel invocation -type shifty_args_t struct { - arg_dst unsafe.Pointer - arg_src unsafe.Pointer - arg_Nx int - arg_Ny int - arg_Nz int - arg_shy int - arg_clampL float32 - arg_clampR float32 - argptr [8]unsafe.Pointer +type shifty_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shy int + arg_clampL float32 + arg_clampR float32 + argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shifty kernel invocation var shifty_args shifty_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - shifty_args.argptr[0] = unsafe.Pointer(&shifty_args.arg_dst) - shifty_args.argptr[1] = unsafe.Pointer(&shifty_args.arg_src) - shifty_args.argptr[2] = unsafe.Pointer(&shifty_args.arg_Nx) - shifty_args.argptr[3] = unsafe.Pointer(&shifty_args.arg_Ny) - shifty_args.argptr[4] = unsafe.Pointer(&shifty_args.arg_Nz) - shifty_args.argptr[5] = unsafe.Pointer(&shifty_args.arg_shy) - shifty_args.argptr[6] = unsafe.Pointer(&shifty_args.arg_clampL) - shifty_args.argptr[7] = unsafe.Pointer(&shifty_args.arg_clampR) -} + shifty_args.argptr[0] = unsafe.Pointer(&shifty_args.arg_dst) + shifty_args.argptr[1] = unsafe.Pointer(&shifty_args.arg_src) + shifty_args.argptr[2] = unsafe.Pointer(&shifty_args.arg_Nx) + shifty_args.argptr[3] = unsafe.Pointer(&shifty_args.arg_Ny) + shifty_args.argptr[4] = unsafe.Pointer(&shifty_args.arg_Nz) + shifty_args.argptr[5] = unsafe.Pointer(&shifty_args.arg_shy) + shifty_args.argptr[6] = unsafe.Pointer(&shifty_args.arg_clampL) + shifty_args.argptr[7] = unsafe.Pointer(&shifty_args.arg_clampR) + } // Wrapper for shifty CUDA kernel, asynchronous. 
-func k_shifty_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clampL float32, clampR float32, cfg *config) { - if Synchronous { // debug +func k_shifty_async ( dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clampL float32, clampR float32, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("shifty") } @@ -54,44 +54,45 @@ func k_shifty_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz i shifty_args.Lock() defer shifty_args.Unlock() - if shifty_code == 0 { + if shifty_code == 0{ shifty_code = fatbinLoad(shifty_map, "shifty") } - shifty_args.arg_dst = dst - shifty_args.arg_src = src - shifty_args.arg_Nx = Nx - shifty_args.arg_Ny = Ny - shifty_args.arg_Nz = Nz - shifty_args.arg_shy = shy - shifty_args.arg_clampL = clampL - shifty_args.arg_clampR = clampR + shifty_args.arg_dst = dst + shifty_args.arg_src = src + shifty_args.arg_Nx = Nx + shifty_args.arg_Ny = Ny + shifty_args.arg_Nz = Nz + shifty_args.arg_shy = shy + shifty_args.arg_clampL = clampL + shifty_args.arg_clampR = clampR + args := shifty_args.argptr[:] cu.LaunchKernel(shifty_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("shifty") } } // maps compute capability on PTX code for shifty kernel. -var shifty_map = map[int]string{0: "", - 30: shifty_ptx_30, - 35: shifty_ptx_35, - 37: shifty_ptx_37, - 50: shifty_ptx_50, - 52: shifty_ptx_52, - 53: shifty_ptx_53, - 60: shifty_ptx_60, - 61: shifty_ptx_61, - 70: shifty_ptx_70, - 75: shifty_ptx_75} +var shifty_map = map[int]string{ 0: "" , +30: shifty_ptx_30 , +35: shifty_ptx_35 , +37: shifty_ptx_37 , +50: shifty_ptx_50 , +52: shifty_ptx_52 , +53: shifty_ptx_53 , +60: shifty_ptx_60 , +61: shifty_ptx_61 , +70: shifty_ptx_70 , +75: shifty_ptx_75 } // shifty PTX code for various compute capabilities. 
-const ( - shifty_ptx_30 = ` +const( + shifty_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -173,7 +174,7 @@ BB0_5: ` - shifty_ptx_35 = ` + shifty_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -255,7 +256,7 @@ BB0_5: ` - shifty_ptx_37 = ` + shifty_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -337,7 +338,7 @@ BB0_5: ` - shifty_ptx_50 = ` + shifty_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -419,7 +420,7 @@ BB0_5: ` - shifty_ptx_52 = ` + shifty_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -501,7 +502,7 @@ BB0_5: ` - shifty_ptx_53 = ` + shifty_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -583,7 +584,7 @@ BB0_5: ` - shifty_ptx_60 = ` + shifty_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -665,7 +666,7 @@ BB0_5: ` - shifty_ptx_61 = ` + shifty_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -747,7 +748,7 @@ BB0_5: ` - shifty_ptx_70 = ` + shifty_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -829,7 +830,7 @@ BB0_5: ` - shifty_ptx_75 = ` + shifty_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -911,4 +912,4 @@ BB0_5: ` -) + ) diff --git a/cuda/shiftz_wrapper.go b/cuda/shiftz_wrapper.go index 51c307c42..43424b31a 100644 --- a/cuda/shiftz_wrapper.go +++ b/cuda/shiftz_wrapper.go @@ -5,48 +5,48 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for shiftz kernel var shiftz_code cu.Function // Stores the arguments for shiftz kernel invocation -type shiftz_args_t struct { - arg_dst unsafe.Pointer - arg_src unsafe.Pointer - arg_Nx int - arg_Ny int - arg_Nz int - arg_shz int - arg_clampL float32 - arg_clampR float32 - argptr [8]unsafe.Pointer +type shiftz_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shz int + arg_clampL float32 + arg_clampR float32 + argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftz kernel invocation var shiftz_args shiftz_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - shiftz_args.argptr[0] = unsafe.Pointer(&shiftz_args.arg_dst) - shiftz_args.argptr[1] = unsafe.Pointer(&shiftz_args.arg_src) - shiftz_args.argptr[2] = unsafe.Pointer(&shiftz_args.arg_Nx) - shiftz_args.argptr[3] = unsafe.Pointer(&shiftz_args.arg_Ny) - shiftz_args.argptr[4] = unsafe.Pointer(&shiftz_args.arg_Nz) - shiftz_args.argptr[5] = unsafe.Pointer(&shiftz_args.arg_shz) - shiftz_args.argptr[6] = unsafe.Pointer(&shiftz_args.arg_clampL) - shiftz_args.argptr[7] = unsafe.Pointer(&shiftz_args.arg_clampR) -} + shiftz_args.argptr[0] = unsafe.Pointer(&shiftz_args.arg_dst) + shiftz_args.argptr[1] = unsafe.Pointer(&shiftz_args.arg_src) + shiftz_args.argptr[2] = unsafe.Pointer(&shiftz_args.arg_Nx) + shiftz_args.argptr[3] = unsafe.Pointer(&shiftz_args.arg_Ny) + shiftz_args.argptr[4] = unsafe.Pointer(&shiftz_args.arg_Nz) + shiftz_args.argptr[5] = unsafe.Pointer(&shiftz_args.arg_shz) + shiftz_args.argptr[6] = unsafe.Pointer(&shiftz_args.arg_clampL) + shiftz_args.argptr[7] = unsafe.Pointer(&shiftz_args.arg_clampR) + } // Wrapper for shiftz CUDA kernel, asynchronous. 
-func k_shiftz_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shz int, clampL float32, clampR float32, cfg *config) { - if Synchronous { // debug +func k_shiftz_async ( dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shz int, clampL float32, clampR float32, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("shiftz") } @@ -54,44 +54,45 @@ func k_shiftz_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz i shiftz_args.Lock() defer shiftz_args.Unlock() - if shiftz_code == 0 { + if shiftz_code == 0{ shiftz_code = fatbinLoad(shiftz_map, "shiftz") } - shiftz_args.arg_dst = dst - shiftz_args.arg_src = src - shiftz_args.arg_Nx = Nx - shiftz_args.arg_Ny = Ny - shiftz_args.arg_Nz = Nz - shiftz_args.arg_shz = shz - shiftz_args.arg_clampL = clampL - shiftz_args.arg_clampR = clampR + shiftz_args.arg_dst = dst + shiftz_args.arg_src = src + shiftz_args.arg_Nx = Nx + shiftz_args.arg_Ny = Ny + shiftz_args.arg_Nz = Nz + shiftz_args.arg_shz = shz + shiftz_args.arg_clampL = clampL + shiftz_args.arg_clampR = clampR + args := shiftz_args.argptr[:] cu.LaunchKernel(shiftz_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("shiftz") } } // maps compute capability on PTX code for shiftz kernel. -var shiftz_map = map[int]string{0: "", - 30: shiftz_ptx_30, - 35: shiftz_ptx_35, - 37: shiftz_ptx_37, - 50: shiftz_ptx_50, - 52: shiftz_ptx_52, - 53: shiftz_ptx_53, - 60: shiftz_ptx_60, - 61: shiftz_ptx_61, - 70: shiftz_ptx_70, - 75: shiftz_ptx_75} +var shiftz_map = map[int]string{ 0: "" , +30: shiftz_ptx_30 , +35: shiftz_ptx_35 , +37: shiftz_ptx_37 , +50: shiftz_ptx_50 , +52: shiftz_ptx_52 , +53: shiftz_ptx_53 , +60: shiftz_ptx_60 , +61: shiftz_ptx_61 , +70: shiftz_ptx_70 , +75: shiftz_ptx_75 } // shiftz PTX code for various compute capabilities. 
-const ( - shiftz_ptx_30 = ` +const( + shiftz_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -173,7 +174,7 @@ BB0_5: ` - shiftz_ptx_35 = ` + shiftz_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -255,7 +256,7 @@ BB0_5: ` - shiftz_ptx_37 = ` + shiftz_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -337,7 +338,7 @@ BB0_5: ` - shiftz_ptx_50 = ` + shiftz_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -419,7 +420,7 @@ BB0_5: ` - shiftz_ptx_52 = ` + shiftz_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -501,7 +502,7 @@ BB0_5: ` - shiftz_ptx_53 = ` + shiftz_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -583,7 +584,7 @@ BB0_5: ` - shiftz_ptx_60 = ` + shiftz_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -665,7 +666,7 @@ BB0_5: ` - shiftz_ptx_61 = ` + shiftz_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -747,7 +748,7 @@ BB0_5: ` - shiftz_ptx_70 = ` + shiftz_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -829,7 +830,7 @@ BB0_5: ` - shiftz_ptx_75 = ` + shiftz_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -911,4 +912,4 @@ BB0_5: ` -) + ) diff --git a/cuda/slonczewski2_wrapper.go b/cuda/slonczewski2_wrapper.go index 22136d13b..bd5d3c516 100644 --- a/cuda/slonczewski2_wrapper.go +++ b/cuda/slonczewski2_wrapper.go @@ -5,86 +5,86 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for addslonczewskitorque2 kernel var addslonczewskitorque2_code cu.Function // Stores the arguments for addslonczewskitorque2 kernel invocation -type addslonczewskitorque2_args_t struct { - arg_tx unsafe.Pointer - arg_ty unsafe.Pointer - arg_tz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_jz_ unsafe.Pointer - arg_jz_mul float32 - arg_px_ unsafe.Pointer - arg_px_mul float32 - arg_py_ unsafe.Pointer - arg_py_mul float32 - arg_pz_ unsafe.Pointer - arg_pz_mul float32 - arg_alpha_ unsafe.Pointer - arg_alpha_mul float32 - arg_pol_ unsafe.Pointer - arg_pol_mul float32 - arg_lambda_ unsafe.Pointer - arg_lambda_mul float32 - arg_epsPrime_ unsafe.Pointer - arg_epsPrime_mul float32 - arg_flt_ unsafe.Pointer - arg_flt_mul float32 - arg_N int - argptr [27]unsafe.Pointer +type addslonczewskitorque2_args_t struct{ + arg_tx unsafe.Pointer + arg_ty unsafe.Pointer + arg_tz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_jz_ unsafe.Pointer + arg_jz_mul float32 + arg_px_ unsafe.Pointer + arg_px_mul float32 + arg_py_ unsafe.Pointer + arg_py_mul float32 + arg_pz_ unsafe.Pointer + arg_pz_mul float32 + arg_alpha_ unsafe.Pointer + arg_alpha_mul float32 + arg_pol_ unsafe.Pointer + arg_pol_mul float32 + arg_lambda_ unsafe.Pointer + arg_lambda_mul float32 + arg_epsPrime_ unsafe.Pointer + arg_epsPrime_mul float32 + arg_flt_ unsafe.Pointer + arg_flt_mul float32 + arg_N int + argptr [27]unsafe.Pointer sync.Mutex } // Stores the arguments for addslonczewskitorque2 kernel invocation var addslonczewskitorque2_args addslonczewskitorque2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- addslonczewskitorque2_args.argptr[0] = unsafe.Pointer(&addslonczewskitorque2_args.arg_tx) - addslonczewskitorque2_args.argptr[1] = unsafe.Pointer(&addslonczewskitorque2_args.arg_ty) - addslonczewskitorque2_args.argptr[2] = unsafe.Pointer(&addslonczewskitorque2_args.arg_tz) - addslonczewskitorque2_args.argptr[3] = unsafe.Pointer(&addslonczewskitorque2_args.arg_mx) - addslonczewskitorque2_args.argptr[4] = unsafe.Pointer(&addslonczewskitorque2_args.arg_my) - addslonczewskitorque2_args.argptr[5] = unsafe.Pointer(&addslonczewskitorque2_args.arg_mz) - addslonczewskitorque2_args.argptr[6] = unsafe.Pointer(&addslonczewskitorque2_args.arg_Ms_) - addslonczewskitorque2_args.argptr[7] = unsafe.Pointer(&addslonczewskitorque2_args.arg_Ms_mul) - addslonczewskitorque2_args.argptr[8] = unsafe.Pointer(&addslonczewskitorque2_args.arg_jz_) - addslonczewskitorque2_args.argptr[9] = unsafe.Pointer(&addslonczewskitorque2_args.arg_jz_mul) - addslonczewskitorque2_args.argptr[10] = unsafe.Pointer(&addslonczewskitorque2_args.arg_px_) - addslonczewskitorque2_args.argptr[11] = unsafe.Pointer(&addslonczewskitorque2_args.arg_px_mul) - addslonczewskitorque2_args.argptr[12] = unsafe.Pointer(&addslonczewskitorque2_args.arg_py_) - addslonczewskitorque2_args.argptr[13] = unsafe.Pointer(&addslonczewskitorque2_args.arg_py_mul) - addslonczewskitorque2_args.argptr[14] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pz_) - addslonczewskitorque2_args.argptr[15] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pz_mul) - addslonczewskitorque2_args.argptr[16] = unsafe.Pointer(&addslonczewskitorque2_args.arg_alpha_) - addslonczewskitorque2_args.argptr[17] = unsafe.Pointer(&addslonczewskitorque2_args.arg_alpha_mul) - addslonczewskitorque2_args.argptr[18] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pol_) - addslonczewskitorque2_args.argptr[19] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pol_mul) - addslonczewskitorque2_args.argptr[20] = unsafe.Pointer(&addslonczewskitorque2_args.arg_lambda_) - 
addslonczewskitorque2_args.argptr[21] = unsafe.Pointer(&addslonczewskitorque2_args.arg_lambda_mul) - addslonczewskitorque2_args.argptr[22] = unsafe.Pointer(&addslonczewskitorque2_args.arg_epsPrime_) - addslonczewskitorque2_args.argptr[23] = unsafe.Pointer(&addslonczewskitorque2_args.arg_epsPrime_mul) - addslonczewskitorque2_args.argptr[24] = unsafe.Pointer(&addslonczewskitorque2_args.arg_flt_) - addslonczewskitorque2_args.argptr[25] = unsafe.Pointer(&addslonczewskitorque2_args.arg_flt_mul) - addslonczewskitorque2_args.argptr[26] = unsafe.Pointer(&addslonczewskitorque2_args.arg_N) -} + addslonczewskitorque2_args.argptr[0] = unsafe.Pointer(&addslonczewskitorque2_args.arg_tx) + addslonczewskitorque2_args.argptr[1] = unsafe.Pointer(&addslonczewskitorque2_args.arg_ty) + addslonczewskitorque2_args.argptr[2] = unsafe.Pointer(&addslonczewskitorque2_args.arg_tz) + addslonczewskitorque2_args.argptr[3] = unsafe.Pointer(&addslonczewskitorque2_args.arg_mx) + addslonczewskitorque2_args.argptr[4] = unsafe.Pointer(&addslonczewskitorque2_args.arg_my) + addslonczewskitorque2_args.argptr[5] = unsafe.Pointer(&addslonczewskitorque2_args.arg_mz) + addslonczewskitorque2_args.argptr[6] = unsafe.Pointer(&addslonczewskitorque2_args.arg_Ms_) + addslonczewskitorque2_args.argptr[7] = unsafe.Pointer(&addslonczewskitorque2_args.arg_Ms_mul) + addslonczewskitorque2_args.argptr[8] = unsafe.Pointer(&addslonczewskitorque2_args.arg_jz_) + addslonczewskitorque2_args.argptr[9] = unsafe.Pointer(&addslonczewskitorque2_args.arg_jz_mul) + addslonczewskitorque2_args.argptr[10] = unsafe.Pointer(&addslonczewskitorque2_args.arg_px_) + addslonczewskitorque2_args.argptr[11] = unsafe.Pointer(&addslonczewskitorque2_args.arg_px_mul) + addslonczewskitorque2_args.argptr[12] = unsafe.Pointer(&addslonczewskitorque2_args.arg_py_) + addslonczewskitorque2_args.argptr[13] = unsafe.Pointer(&addslonczewskitorque2_args.arg_py_mul) + addslonczewskitorque2_args.argptr[14] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pz_) + 
addslonczewskitorque2_args.argptr[15] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pz_mul) + addslonczewskitorque2_args.argptr[16] = unsafe.Pointer(&addslonczewskitorque2_args.arg_alpha_) + addslonczewskitorque2_args.argptr[17] = unsafe.Pointer(&addslonczewskitorque2_args.arg_alpha_mul) + addslonczewskitorque2_args.argptr[18] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pol_) + addslonczewskitorque2_args.argptr[19] = unsafe.Pointer(&addslonczewskitorque2_args.arg_pol_mul) + addslonczewskitorque2_args.argptr[20] = unsafe.Pointer(&addslonczewskitorque2_args.arg_lambda_) + addslonczewskitorque2_args.argptr[21] = unsafe.Pointer(&addslonczewskitorque2_args.arg_lambda_mul) + addslonczewskitorque2_args.argptr[22] = unsafe.Pointer(&addslonczewskitorque2_args.arg_epsPrime_) + addslonczewskitorque2_args.argptr[23] = unsafe.Pointer(&addslonczewskitorque2_args.arg_epsPrime_mul) + addslonczewskitorque2_args.argptr[24] = unsafe.Pointer(&addslonczewskitorque2_args.arg_flt_) + addslonczewskitorque2_args.argptr[25] = unsafe.Pointer(&addslonczewskitorque2_args.arg_flt_mul) + addslonczewskitorque2_args.argptr[26] = unsafe.Pointer(&addslonczewskitorque2_args.arg_N) + } // Wrapper for addslonczewskitorque2 CUDA kernel, asynchronous. 
-func k_addslonczewskitorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, jz_ unsafe.Pointer, jz_mul float32, px_ unsafe.Pointer, px_mul float32, py_ unsafe.Pointer, py_mul float32, pz_ unsafe.Pointer, pz_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, pol_ unsafe.Pointer, pol_mul float32, lambda_ unsafe.Pointer, lambda_mul float32, epsPrime_ unsafe.Pointer, epsPrime_mul float32, flt_ unsafe.Pointer, flt_mul float32, N int, cfg *config) { - if Synchronous { // debug +func k_addslonczewskitorque2_async ( tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, jz_ unsafe.Pointer, jz_mul float32, px_ unsafe.Pointer, px_mul float32, py_ unsafe.Pointer, py_mul float32, pz_ unsafe.Pointer, pz_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, pol_ unsafe.Pointer, pol_mul float32, lambda_ unsafe.Pointer, lambda_mul float32, epsPrime_ unsafe.Pointer, epsPrime_mul float32, flt_ unsafe.Pointer, flt_mul float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("addslonczewskitorque2") } @@ -92,63 +92,64 @@ func k_addslonczewskitorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsa addslonczewskitorque2_args.Lock() defer addslonczewskitorque2_args.Unlock() - if addslonczewskitorque2_code == 0 { + if addslonczewskitorque2_code == 0{ addslonczewskitorque2_code = fatbinLoad(addslonczewskitorque2_map, "addslonczewskitorque2") } - addslonczewskitorque2_args.arg_tx = tx - addslonczewskitorque2_args.arg_ty = ty - addslonczewskitorque2_args.arg_tz = tz - addslonczewskitorque2_args.arg_mx = mx - addslonczewskitorque2_args.arg_my = my - addslonczewskitorque2_args.arg_mz = mz - addslonczewskitorque2_args.arg_Ms_ = Ms_ - addslonczewskitorque2_args.arg_Ms_mul = Ms_mul - addslonczewskitorque2_args.arg_jz_ = jz_ - addslonczewskitorque2_args.arg_jz_mul = 
jz_mul - addslonczewskitorque2_args.arg_px_ = px_ - addslonczewskitorque2_args.arg_px_mul = px_mul - addslonczewskitorque2_args.arg_py_ = py_ - addslonczewskitorque2_args.arg_py_mul = py_mul - addslonczewskitorque2_args.arg_pz_ = pz_ - addslonczewskitorque2_args.arg_pz_mul = pz_mul - addslonczewskitorque2_args.arg_alpha_ = alpha_ - addslonczewskitorque2_args.arg_alpha_mul = alpha_mul - addslonczewskitorque2_args.arg_pol_ = pol_ - addslonczewskitorque2_args.arg_pol_mul = pol_mul - addslonczewskitorque2_args.arg_lambda_ = lambda_ - addslonczewskitorque2_args.arg_lambda_mul = lambda_mul - addslonczewskitorque2_args.arg_epsPrime_ = epsPrime_ - addslonczewskitorque2_args.arg_epsPrime_mul = epsPrime_mul - addslonczewskitorque2_args.arg_flt_ = flt_ - addslonczewskitorque2_args.arg_flt_mul = flt_mul - addslonczewskitorque2_args.arg_N = N + addslonczewskitorque2_args.arg_tx = tx + addslonczewskitorque2_args.arg_ty = ty + addslonczewskitorque2_args.arg_tz = tz + addslonczewskitorque2_args.arg_mx = mx + addslonczewskitorque2_args.arg_my = my + addslonczewskitorque2_args.arg_mz = mz + addslonczewskitorque2_args.arg_Ms_ = Ms_ + addslonczewskitorque2_args.arg_Ms_mul = Ms_mul + addslonczewskitorque2_args.arg_jz_ = jz_ + addslonczewskitorque2_args.arg_jz_mul = jz_mul + addslonczewskitorque2_args.arg_px_ = px_ + addslonczewskitorque2_args.arg_px_mul = px_mul + addslonczewskitorque2_args.arg_py_ = py_ + addslonczewskitorque2_args.arg_py_mul = py_mul + addslonczewskitorque2_args.arg_pz_ = pz_ + addslonczewskitorque2_args.arg_pz_mul = pz_mul + addslonczewskitorque2_args.arg_alpha_ = alpha_ + addslonczewskitorque2_args.arg_alpha_mul = alpha_mul + addslonczewskitorque2_args.arg_pol_ = pol_ + addslonczewskitorque2_args.arg_pol_mul = pol_mul + addslonczewskitorque2_args.arg_lambda_ = lambda_ + addslonczewskitorque2_args.arg_lambda_mul = lambda_mul + addslonczewskitorque2_args.arg_epsPrime_ = epsPrime_ + addslonczewskitorque2_args.arg_epsPrime_mul = epsPrime_mul + 
addslonczewskitorque2_args.arg_flt_ = flt_ + addslonczewskitorque2_args.arg_flt_mul = flt_mul + addslonczewskitorque2_args.arg_N = N + args := addslonczewskitorque2_args.argptr[:] cu.LaunchKernel(addslonczewskitorque2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("addslonczewskitorque2") } } // maps compute capability on PTX code for addslonczewskitorque2 kernel. -var addslonczewskitorque2_map = map[int]string{0: "", - 30: addslonczewskitorque2_ptx_30, - 35: addslonczewskitorque2_ptx_35, - 37: addslonczewskitorque2_ptx_37, - 50: addslonczewskitorque2_ptx_50, - 52: addslonczewskitorque2_ptx_52, - 53: addslonczewskitorque2_ptx_53, - 60: addslonczewskitorque2_ptx_60, - 61: addslonczewskitorque2_ptx_61, - 70: addslonczewskitorque2_ptx_70, - 75: addslonczewskitorque2_ptx_75} +var addslonczewskitorque2_map = map[int]string{ 0: "" , +30: addslonczewskitorque2_ptx_30 , +35: addslonczewskitorque2_ptx_35 , +37: addslonczewskitorque2_ptx_37 , +50: addslonczewskitorque2_ptx_50 , +52: addslonczewskitorque2_ptx_52 , +53: addslonczewskitorque2_ptx_53 , +60: addslonczewskitorque2_ptx_60 , +61: addslonczewskitorque2_ptx_61 , +70: addslonczewskitorque2_ptx_70 , +75: addslonczewskitorque2_ptx_75 } // addslonczewskitorque2 PTX code for various compute capabilities. 
-const ( - addslonczewskitorque2_ptx_30 = ` +const( + addslonczewskitorque2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -417,7 +418,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_35 = ` + addslonczewskitorque2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -686,7 +687,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_37 = ` + addslonczewskitorque2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -955,7 +956,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_50 = ` + addslonczewskitorque2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -1224,7 +1225,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_52 = ` + addslonczewskitorque2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -1493,7 +1494,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_53 = ` + addslonczewskitorque2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -1762,7 +1763,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_60 = ` + addslonczewskitorque2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -2031,7 +2032,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_61 = ` + addslonczewskitorque2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -2300,7 +2301,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_70 = ` + addslonczewskitorque2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -2569,7 +2570,7 @@ BB0_25: ` - addslonczewskitorque2_ptx_75 = ` + addslonczewskitorque2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -2838,4 +2839,4 @@ BB0_25: ` -) + ) diff --git a/cuda/temperature2_wrapper.go b/cuda/temperature2_wrapper.go index 0671ad689..bd6dc1d47 100644 --- a/cuda/temperature2_wrapper.go +++ b/cuda/temperature2_wrapper.go @@ -5,52 +5,52 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for settemperature2 kernel var settemperature2_code cu.Function // Stores the arguments for settemperature2 kernel invocation -type settemperature2_args_t struct { - arg_B unsafe.Pointer - arg_noise unsafe.Pointer - arg_kB2_VgammaDt float32 - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_temp_ unsafe.Pointer - arg_temp_mul float32 - arg_alpha_ unsafe.Pointer - arg_alpha_mul float32 - arg_N int - argptr [10]unsafe.Pointer +type settemperature2_args_t struct{ + arg_B unsafe.Pointer + arg_noise unsafe.Pointer + arg_kB2_VgammaDt float32 + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_temp_ unsafe.Pointer + arg_temp_mul float32 + arg_alpha_ unsafe.Pointer + arg_alpha_mul float32 + arg_N int + argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for settemperature2 kernel invocation var settemperature2_args settemperature2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- settemperature2_args.argptr[0] = unsafe.Pointer(&settemperature2_args.arg_B) - settemperature2_args.argptr[1] = unsafe.Pointer(&settemperature2_args.arg_noise) - settemperature2_args.argptr[2] = unsafe.Pointer(&settemperature2_args.arg_kB2_VgammaDt) - settemperature2_args.argptr[3] = unsafe.Pointer(&settemperature2_args.arg_Ms_) - settemperature2_args.argptr[4] = unsafe.Pointer(&settemperature2_args.arg_Ms_mul) - settemperature2_args.argptr[5] = unsafe.Pointer(&settemperature2_args.arg_temp_) - settemperature2_args.argptr[6] = unsafe.Pointer(&settemperature2_args.arg_temp_mul) - settemperature2_args.argptr[7] = unsafe.Pointer(&settemperature2_args.arg_alpha_) - settemperature2_args.argptr[8] = unsafe.Pointer(&settemperature2_args.arg_alpha_mul) - settemperature2_args.argptr[9] = unsafe.Pointer(&settemperature2_args.arg_N) -} + settemperature2_args.argptr[0] = unsafe.Pointer(&settemperature2_args.arg_B) + settemperature2_args.argptr[1] = unsafe.Pointer(&settemperature2_args.arg_noise) + settemperature2_args.argptr[2] = unsafe.Pointer(&settemperature2_args.arg_kB2_VgammaDt) + settemperature2_args.argptr[3] = unsafe.Pointer(&settemperature2_args.arg_Ms_) + settemperature2_args.argptr[4] = unsafe.Pointer(&settemperature2_args.arg_Ms_mul) + settemperature2_args.argptr[5] = unsafe.Pointer(&settemperature2_args.arg_temp_) + settemperature2_args.argptr[6] = unsafe.Pointer(&settemperature2_args.arg_temp_mul) + settemperature2_args.argptr[7] = unsafe.Pointer(&settemperature2_args.arg_alpha_) + settemperature2_args.argptr[8] = unsafe.Pointer(&settemperature2_args.arg_alpha_mul) + settemperature2_args.argptr[9] = unsafe.Pointer(&settemperature2_args.arg_N) + } // Wrapper for settemperature2 CUDA kernel, asynchronous. 
-func k_settemperature2_async(B unsafe.Pointer, noise unsafe.Pointer, kB2_VgammaDt float32, Ms_ unsafe.Pointer, Ms_mul float32, temp_ unsafe.Pointer, temp_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { - if Synchronous { // debug +func k_settemperature2_async ( B unsafe.Pointer, noise unsafe.Pointer, kB2_VgammaDt float32, Ms_ unsafe.Pointer, Ms_mul float32, temp_ unsafe.Pointer, temp_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("settemperature2") } @@ -58,46 +58,47 @@ func k_settemperature2_async(B unsafe.Pointer, noise unsafe.Pointer, kB2_VgammaD settemperature2_args.Lock() defer settemperature2_args.Unlock() - if settemperature2_code == 0 { + if settemperature2_code == 0{ settemperature2_code = fatbinLoad(settemperature2_map, "settemperature2") } - settemperature2_args.arg_B = B - settemperature2_args.arg_noise = noise - settemperature2_args.arg_kB2_VgammaDt = kB2_VgammaDt - settemperature2_args.arg_Ms_ = Ms_ - settemperature2_args.arg_Ms_mul = Ms_mul - settemperature2_args.arg_temp_ = temp_ - settemperature2_args.arg_temp_mul = temp_mul - settemperature2_args.arg_alpha_ = alpha_ - settemperature2_args.arg_alpha_mul = alpha_mul - settemperature2_args.arg_N = N + settemperature2_args.arg_B = B + settemperature2_args.arg_noise = noise + settemperature2_args.arg_kB2_VgammaDt = kB2_VgammaDt + settemperature2_args.arg_Ms_ = Ms_ + settemperature2_args.arg_Ms_mul = Ms_mul + settemperature2_args.arg_temp_ = temp_ + settemperature2_args.arg_temp_mul = temp_mul + settemperature2_args.arg_alpha_ = alpha_ + settemperature2_args.arg_alpha_mul = alpha_mul + settemperature2_args.arg_N = N + args := settemperature2_args.argptr[:] cu.LaunchKernel(settemperature2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("settemperature2") } } // maps compute 
capability on PTX code for settemperature2 kernel. -var settemperature2_map = map[int]string{0: "", - 30: settemperature2_ptx_30, - 35: settemperature2_ptx_35, - 37: settemperature2_ptx_37, - 50: settemperature2_ptx_50, - 52: settemperature2_ptx_52, - 53: settemperature2_ptx_53, - 60: settemperature2_ptx_60, - 61: settemperature2_ptx_61, - 70: settemperature2_ptx_70, - 75: settemperature2_ptx_75} +var settemperature2_map = map[int]string{ 0: "" , +30: settemperature2_ptx_30 , +35: settemperature2_ptx_35 , +37: settemperature2_ptx_37 , +50: settemperature2_ptx_50 , +52: settemperature2_ptx_52 , +53: settemperature2_ptx_53 , +60: settemperature2_ptx_60 , +61: settemperature2_ptx_61 , +70: settemperature2_ptx_70 , +75: settemperature2_ptx_75 } // settemperature2 PTX code for various compute capabilities. -const ( - settemperature2_ptx_30 = ` +const( + settemperature2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -199,7 +200,7 @@ BB0_10: ` - settemperature2_ptx_35 = ` + settemperature2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -301,7 +302,7 @@ BB0_10: ` - settemperature2_ptx_37 = ` + settemperature2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -403,7 +404,7 @@ BB0_10: ` - settemperature2_ptx_50 = ` + settemperature2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -505,7 +506,7 @@ BB0_10: ` - settemperature2_ptx_52 = ` + settemperature2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -607,7 +608,7 @@ BB0_10: ` - settemperature2_ptx_53 = ` + settemperature2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -709,7 +710,7 @@ BB0_10: ` - settemperature2_ptx_60 = ` + settemperature2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -811,7 +812,7 @@ BB0_10: ` - settemperature2_ptx_61 = ` + settemperature2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -913,7 +914,7 @@ BB0_10: ` - settemperature2_ptx_70 = ` + settemperature2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1015,7 
+1016,7 @@ BB0_10: ` - settemperature2_ptx_75 = ` + settemperature2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -1117,4 +1118,4 @@ BB0_10: ` -) + ) diff --git a/cuda/topologicalcharge_wrapper.go b/cuda/topologicalcharge_wrapper.go index e45b2d7b0..1dad5e453 100644 --- a/cuda/topologicalcharge_wrapper.go +++ b/cuda/topologicalcharge_wrapper.go @@ -5,50 +5,50 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for settopologicalcharge kernel var settopologicalcharge_code cu.Function // Stores the arguments for settopologicalcharge kernel invocation -type settopologicalcharge_args_t struct { - arg_s unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_icxcy float32 - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - argptr [9]unsafe.Pointer +type settopologicalcharge_args_t struct{ + arg_s unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_icxcy float32 + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for settopologicalcharge kernel invocation var settopologicalcharge_args settopologicalcharge_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- settopologicalcharge_args.argptr[0] = unsafe.Pointer(&settopologicalcharge_args.arg_s) - settopologicalcharge_args.argptr[1] = unsafe.Pointer(&settopologicalcharge_args.arg_mx) - settopologicalcharge_args.argptr[2] = unsafe.Pointer(&settopologicalcharge_args.arg_my) - settopologicalcharge_args.argptr[3] = unsafe.Pointer(&settopologicalcharge_args.arg_mz) - settopologicalcharge_args.argptr[4] = unsafe.Pointer(&settopologicalcharge_args.arg_icxcy) - settopologicalcharge_args.argptr[5] = unsafe.Pointer(&settopologicalcharge_args.arg_Nx) - settopologicalcharge_args.argptr[6] = unsafe.Pointer(&settopologicalcharge_args.arg_Ny) - settopologicalcharge_args.argptr[7] = unsafe.Pointer(&settopologicalcharge_args.arg_Nz) - settopologicalcharge_args.argptr[8] = unsafe.Pointer(&settopologicalcharge_args.arg_PBC) -} + settopologicalcharge_args.argptr[0] = unsafe.Pointer(&settopologicalcharge_args.arg_s) + settopologicalcharge_args.argptr[1] = unsafe.Pointer(&settopologicalcharge_args.arg_mx) + settopologicalcharge_args.argptr[2] = unsafe.Pointer(&settopologicalcharge_args.arg_my) + settopologicalcharge_args.argptr[3] = unsafe.Pointer(&settopologicalcharge_args.arg_mz) + settopologicalcharge_args.argptr[4] = unsafe.Pointer(&settopologicalcharge_args.arg_icxcy) + settopologicalcharge_args.argptr[5] = unsafe.Pointer(&settopologicalcharge_args.arg_Nx) + settopologicalcharge_args.argptr[6] = unsafe.Pointer(&settopologicalcharge_args.arg_Ny) + settopologicalcharge_args.argptr[7] = unsafe.Pointer(&settopologicalcharge_args.arg_Nz) + settopologicalcharge_args.argptr[8] = unsafe.Pointer(&settopologicalcharge_args.arg_PBC) + } // Wrapper for settopologicalcharge CUDA kernel, asynchronous. 
-func k_settopologicalcharge_async(s unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { - if Synchronous { // debug +func k_settopologicalcharge_async ( s unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("settopologicalcharge") } @@ -56,45 +56,46 @@ func k_settopologicalcharge_async(s unsafe.Pointer, mx unsafe.Pointer, my unsafe settopologicalcharge_args.Lock() defer settopologicalcharge_args.Unlock() - if settopologicalcharge_code == 0 { + if settopologicalcharge_code == 0{ settopologicalcharge_code = fatbinLoad(settopologicalcharge_map, "settopologicalcharge") } - settopologicalcharge_args.arg_s = s - settopologicalcharge_args.arg_mx = mx - settopologicalcharge_args.arg_my = my - settopologicalcharge_args.arg_mz = mz - settopologicalcharge_args.arg_icxcy = icxcy - settopologicalcharge_args.arg_Nx = Nx - settopologicalcharge_args.arg_Ny = Ny - settopologicalcharge_args.arg_Nz = Nz - settopologicalcharge_args.arg_PBC = PBC + settopologicalcharge_args.arg_s = s + settopologicalcharge_args.arg_mx = mx + settopologicalcharge_args.arg_my = my + settopologicalcharge_args.arg_mz = mz + settopologicalcharge_args.arg_icxcy = icxcy + settopologicalcharge_args.arg_Nx = Nx + settopologicalcharge_args.arg_Ny = Ny + settopologicalcharge_args.arg_Nz = Nz + settopologicalcharge_args.arg_PBC = PBC + args := settopologicalcharge_args.argptr[:] cu.LaunchKernel(settopologicalcharge_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("settopologicalcharge") } } // maps compute capability on PTX code for settopologicalcharge kernel. 
-var settopologicalcharge_map = map[int]string{0: "", - 30: settopologicalcharge_ptx_30, - 35: settopologicalcharge_ptx_35, - 37: settopologicalcharge_ptx_37, - 50: settopologicalcharge_ptx_50, - 52: settopologicalcharge_ptx_52, - 53: settopologicalcharge_ptx_53, - 60: settopologicalcharge_ptx_60, - 61: settopologicalcharge_ptx_61, - 70: settopologicalcharge_ptx_70, - 75: settopologicalcharge_ptx_75} +var settopologicalcharge_map = map[int]string{ 0: "" , +30: settopologicalcharge_ptx_30 , +35: settopologicalcharge_ptx_35 , +37: settopologicalcharge_ptx_37 , +50: settopologicalcharge_ptx_50 , +52: settopologicalcharge_ptx_52 , +53: settopologicalcharge_ptx_53 , +60: settopologicalcharge_ptx_60 , +61: settopologicalcharge_ptx_61 , +70: settopologicalcharge_ptx_70 , +75: settopologicalcharge_ptx_75 } // settopologicalcharge PTX code for various compute capabilities. -const ( - settopologicalcharge_ptx_30 = ` +const( + settopologicalcharge_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -704,7 +705,7 @@ BB0_72: ` - settopologicalcharge_ptx_35 = ` + settopologicalcharge_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -1312,7 +1313,7 @@ BB0_72: ` - settopologicalcharge_ptx_37 = ` + settopologicalcharge_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -1920,7 +1921,7 @@ BB0_72: ` - settopologicalcharge_ptx_50 = ` + settopologicalcharge_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -2528,7 +2529,7 @@ BB0_72: ` - settopologicalcharge_ptx_52 = ` + settopologicalcharge_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -3136,7 +3137,7 @@ BB0_72: ` - settopologicalcharge_ptx_53 = ` + settopologicalcharge_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -3744,7 +3745,7 @@ BB0_72: ` - settopologicalcharge_ptx_60 = ` + settopologicalcharge_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -4352,7 +4353,7 @@ BB0_72: ` - settopologicalcharge_ptx_61 = ` + settopologicalcharge_ptx_61 = ` .version 6.3 .target sm_61 
.address_size 64 @@ -4960,7 +4961,7 @@ BB0_72: ` - settopologicalcharge_ptx_70 = ` + settopologicalcharge_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -5568,7 +5569,7 @@ BB0_72: ` - settopologicalcharge_ptx_75 = ` + settopologicalcharge_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -6176,4 +6177,4 @@ BB0_72: ` -) + ) diff --git a/cuda/uniaxialanisotropy2_wrapper.go b/cuda/uniaxialanisotropy2_wrapper.go index de44085b6..25c1f40a5 100644 --- a/cuda/uniaxialanisotropy2_wrapper.go +++ b/cuda/uniaxialanisotropy2_wrapper.go @@ -5,70 +5,70 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for adduniaxialanisotropy2 kernel var adduniaxialanisotropy2_code cu.Function // Stores the arguments for adduniaxialanisotropy2 kernel invocation -type adduniaxialanisotropy2_args_t struct { - arg_Bx unsafe.Pointer - arg_By unsafe.Pointer - arg_Bz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_K1_ unsafe.Pointer - arg_K1_mul float32 - arg_K2_ unsafe.Pointer - arg_K2_mul float32 - arg_ux_ unsafe.Pointer - arg_ux_mul float32 - arg_uy_ unsafe.Pointer - arg_uy_mul float32 - arg_uz_ unsafe.Pointer - arg_uz_mul float32 - arg_N int - argptr [19]unsafe.Pointer +type adduniaxialanisotropy2_args_t struct{ + arg_Bx unsafe.Pointer + arg_By unsafe.Pointer + arg_Bz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_K1_ unsafe.Pointer + arg_K1_mul float32 + arg_K2_ unsafe.Pointer + arg_K2_mul float32 + arg_ux_ unsafe.Pointer + arg_ux_mul float32 + arg_uy_ unsafe.Pointer + arg_uy_mul float32 + arg_uz_ unsafe.Pointer + arg_uz_mul float32 + arg_N int + argptr [19]unsafe.Pointer sync.Mutex } // Stores the arguments for adduniaxialanisotropy2 kernel invocation var adduniaxialanisotropy2_args 
adduniaxialanisotropy2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - adduniaxialanisotropy2_args.argptr[0] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bx) - adduniaxialanisotropy2_args.argptr[1] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_By) - adduniaxialanisotropy2_args.argptr[2] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bz) - adduniaxialanisotropy2_args.argptr[3] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mx) - adduniaxialanisotropy2_args.argptr[4] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_my) - adduniaxialanisotropy2_args.argptr[5] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mz) - adduniaxialanisotropy2_args.argptr[6] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_) - adduniaxialanisotropy2_args.argptr[7] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_mul) - adduniaxialanisotropy2_args.argptr[8] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_) - adduniaxialanisotropy2_args.argptr[9] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_mul) - adduniaxialanisotropy2_args.argptr[10] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_) - adduniaxialanisotropy2_args.argptr[11] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_mul) - adduniaxialanisotropy2_args.argptr[12] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_) - adduniaxialanisotropy2_args.argptr[13] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_mul) - adduniaxialanisotropy2_args.argptr[14] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_) - adduniaxialanisotropy2_args.argptr[15] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_mul) - adduniaxialanisotropy2_args.argptr[16] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_) - adduniaxialanisotropy2_args.argptr[17] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_mul) - adduniaxialanisotropy2_args.argptr[18] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_N) -} + adduniaxialanisotropy2_args.argptr[0] 
= unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bx) + adduniaxialanisotropy2_args.argptr[1] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_By) + adduniaxialanisotropy2_args.argptr[2] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bz) + adduniaxialanisotropy2_args.argptr[3] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mx) + adduniaxialanisotropy2_args.argptr[4] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_my) + adduniaxialanisotropy2_args.argptr[5] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mz) + adduniaxialanisotropy2_args.argptr[6] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_) + adduniaxialanisotropy2_args.argptr[7] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_mul) + adduniaxialanisotropy2_args.argptr[8] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_) + adduniaxialanisotropy2_args.argptr[9] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_mul) + adduniaxialanisotropy2_args.argptr[10] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_) + adduniaxialanisotropy2_args.argptr[11] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_mul) + adduniaxialanisotropy2_args.argptr[12] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_) + adduniaxialanisotropy2_args.argptr[13] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_mul) + adduniaxialanisotropy2_args.argptr[14] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_) + adduniaxialanisotropy2_args.argptr[15] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_mul) + adduniaxialanisotropy2_args.argptr[16] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_) + adduniaxialanisotropy2_args.argptr[17] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_mul) + adduniaxialanisotropy2_args.argptr[18] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_N) + } // Wrapper for adduniaxialanisotropy2 CUDA kernel, asynchronous. 
-func k_adduniaxialanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, K1_ unsafe.Pointer, K1_mul float32, K2_ unsafe.Pointer, K2_mul float32, ux_ unsafe.Pointer, ux_mul float32, uy_ unsafe.Pointer, uy_mul float32, uz_ unsafe.Pointer, uz_mul float32, N int, cfg *config) { - if Synchronous { // debug +func k_adduniaxialanisotropy2_async ( Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, K1_ unsafe.Pointer, K1_mul float32, K2_ unsafe.Pointer, K2_mul float32, ux_ unsafe.Pointer, ux_mul float32, uy_ unsafe.Pointer, uy_mul float32, uz_ unsafe.Pointer, uz_mul float32, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("adduniaxialanisotropy2") } @@ -76,55 +76,56 @@ func k_adduniaxialanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz uns adduniaxialanisotropy2_args.Lock() defer adduniaxialanisotropy2_args.Unlock() - if adduniaxialanisotropy2_code == 0 { + if adduniaxialanisotropy2_code == 0{ adduniaxialanisotropy2_code = fatbinLoad(adduniaxialanisotropy2_map, "adduniaxialanisotropy2") } - adduniaxialanisotropy2_args.arg_Bx = Bx - adduniaxialanisotropy2_args.arg_By = By - adduniaxialanisotropy2_args.arg_Bz = Bz - adduniaxialanisotropy2_args.arg_mx = mx - adduniaxialanisotropy2_args.arg_my = my - adduniaxialanisotropy2_args.arg_mz = mz - adduniaxialanisotropy2_args.arg_Ms_ = Ms_ - adduniaxialanisotropy2_args.arg_Ms_mul = Ms_mul - adduniaxialanisotropy2_args.arg_K1_ = K1_ - adduniaxialanisotropy2_args.arg_K1_mul = K1_mul - adduniaxialanisotropy2_args.arg_K2_ = K2_ - adduniaxialanisotropy2_args.arg_K2_mul = K2_mul - adduniaxialanisotropy2_args.arg_ux_ = ux_ - adduniaxialanisotropy2_args.arg_ux_mul = ux_mul - adduniaxialanisotropy2_args.arg_uy_ = uy_ - adduniaxialanisotropy2_args.arg_uy_mul = uy_mul - adduniaxialanisotropy2_args.arg_uz_ = 
uz_ - adduniaxialanisotropy2_args.arg_uz_mul = uz_mul - adduniaxialanisotropy2_args.arg_N = N + adduniaxialanisotropy2_args.arg_Bx = Bx + adduniaxialanisotropy2_args.arg_By = By + adduniaxialanisotropy2_args.arg_Bz = Bz + adduniaxialanisotropy2_args.arg_mx = mx + adduniaxialanisotropy2_args.arg_my = my + adduniaxialanisotropy2_args.arg_mz = mz + adduniaxialanisotropy2_args.arg_Ms_ = Ms_ + adduniaxialanisotropy2_args.arg_Ms_mul = Ms_mul + adduniaxialanisotropy2_args.arg_K1_ = K1_ + adduniaxialanisotropy2_args.arg_K1_mul = K1_mul + adduniaxialanisotropy2_args.arg_K2_ = K2_ + adduniaxialanisotropy2_args.arg_K2_mul = K2_mul + adduniaxialanisotropy2_args.arg_ux_ = ux_ + adduniaxialanisotropy2_args.arg_ux_mul = ux_mul + adduniaxialanisotropy2_args.arg_uy_ = uy_ + adduniaxialanisotropy2_args.arg_uy_mul = uy_mul + adduniaxialanisotropy2_args.arg_uz_ = uz_ + adduniaxialanisotropy2_args.arg_uz_mul = uz_mul + adduniaxialanisotropy2_args.arg_N = N + args := adduniaxialanisotropy2_args.argptr[:] cu.LaunchKernel(adduniaxialanisotropy2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("adduniaxialanisotropy2") } } // maps compute capability on PTX code for adduniaxialanisotropy2 kernel. 
-var adduniaxialanisotropy2_map = map[int]string{0: "", - 30: adduniaxialanisotropy2_ptx_30, - 35: adduniaxialanisotropy2_ptx_35, - 37: adduniaxialanisotropy2_ptx_37, - 50: adduniaxialanisotropy2_ptx_50, - 52: adduniaxialanisotropy2_ptx_52, - 53: adduniaxialanisotropy2_ptx_53, - 60: adduniaxialanisotropy2_ptx_60, - 61: adduniaxialanisotropy2_ptx_61, - 70: adduniaxialanisotropy2_ptx_70, - 75: adduniaxialanisotropy2_ptx_75} +var adduniaxialanisotropy2_map = map[int]string{ 0: "" , +30: adduniaxialanisotropy2_ptx_30 , +35: adduniaxialanisotropy2_ptx_35 , +37: adduniaxialanisotropy2_ptx_37 , +50: adduniaxialanisotropy2_ptx_50 , +52: adduniaxialanisotropy2_ptx_52 , +53: adduniaxialanisotropy2_ptx_53 , +60: adduniaxialanisotropy2_ptx_60 , +61: adduniaxialanisotropy2_ptx_61 , +70: adduniaxialanisotropy2_ptx_70 , +75: adduniaxialanisotropy2_ptx_75 } // adduniaxialanisotropy2 PTX code for various compute capabilities. -const ( - adduniaxialanisotropy2_ptx_30 = ` +const( + adduniaxialanisotropy2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -318,7 +319,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_35 = ` + adduniaxialanisotropy2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -512,7 +513,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_37 = ` + adduniaxialanisotropy2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -706,7 +707,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_50 = ` + adduniaxialanisotropy2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -900,7 +901,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_52 = ` + adduniaxialanisotropy2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -1094,7 +1095,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_53 = ` + adduniaxialanisotropy2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -1288,7 +1289,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_60 = ` + adduniaxialanisotropy2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -1482,7 +1483,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_61 
= ` + adduniaxialanisotropy2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -1676,7 +1677,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_70 = ` + adduniaxialanisotropy2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -1870,7 +1871,7 @@ BB0_18: ` - adduniaxialanisotropy2_ptx_75 = ` + adduniaxialanisotropy2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -2064,4 +2065,4 @@ BB0_18: ` -) + ) diff --git a/cuda/zeromask_wrapper.go b/cuda/zeromask_wrapper.go index 646e2289b..923846500 100644 --- a/cuda/zeromask_wrapper.go +++ b/cuda/zeromask_wrapper.go @@ -5,40 +5,40 @@ package cuda EDITING IS FUTILE. */ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for zeromask kernel var zeromask_code cu.Function // Stores the arguments for zeromask kernel invocation -type zeromask_args_t struct { - arg_dst unsafe.Pointer - arg_maskLUT unsafe.Pointer - arg_regions unsafe.Pointer - arg_N int - argptr [4]unsafe.Pointer +type zeromask_args_t struct{ + arg_dst unsafe.Pointer + arg_maskLUT unsafe.Pointer + arg_regions unsafe.Pointer + arg_N int + argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for zeromask kernel invocation var zeromask_args zeromask_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. - zeromask_args.argptr[0] = unsafe.Pointer(&zeromask_args.arg_dst) - zeromask_args.argptr[1] = unsafe.Pointer(&zeromask_args.arg_maskLUT) - zeromask_args.argptr[2] = unsafe.Pointer(&zeromask_args.arg_regions) - zeromask_args.argptr[3] = unsafe.Pointer(&zeromask_args.arg_N) -} + zeromask_args.argptr[0] = unsafe.Pointer(&zeromask_args.arg_dst) + zeromask_args.argptr[1] = unsafe.Pointer(&zeromask_args.arg_maskLUT) + zeromask_args.argptr[2] = unsafe.Pointer(&zeromask_args.arg_regions) + zeromask_args.argptr[3] = unsafe.Pointer(&zeromask_args.arg_N) + } // Wrapper for zeromask CUDA kernel, asynchronous. 
-func k_zeromask_async(dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { - if Synchronous { // debug +func k_zeromask_async ( dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("zeromask") } @@ -46,40 +46,41 @@ func k_zeromask_async(dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe zeromask_args.Lock() defer zeromask_args.Unlock() - if zeromask_code == 0 { + if zeromask_code == 0{ zeromask_code = fatbinLoad(zeromask_map, "zeromask") } - zeromask_args.arg_dst = dst - zeromask_args.arg_maskLUT = maskLUT - zeromask_args.arg_regions = regions - zeromask_args.arg_N = N + zeromask_args.arg_dst = dst + zeromask_args.arg_maskLUT = maskLUT + zeromask_args.arg_regions = regions + zeromask_args.arg_N = N + args := zeromask_args.argptr[:] cu.LaunchKernel(zeromask_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("zeromask") } } // maps compute capability on PTX code for zeromask kernel. -var zeromask_map = map[int]string{0: "", - 30: zeromask_ptx_30, - 35: zeromask_ptx_35, - 37: zeromask_ptx_37, - 50: zeromask_ptx_50, - 52: zeromask_ptx_52, - 53: zeromask_ptx_53, - 60: zeromask_ptx_60, - 61: zeromask_ptx_61, - 70: zeromask_ptx_70, - 75: zeromask_ptx_75} +var zeromask_map = map[int]string{ 0: "" , +30: zeromask_ptx_30 , +35: zeromask_ptx_35 , +37: zeromask_ptx_37 , +50: zeromask_ptx_50 , +52: zeromask_ptx_52 , +53: zeromask_ptx_53 , +60: zeromask_ptx_60 , +61: zeromask_ptx_61 , +70: zeromask_ptx_70 , +75: zeromask_ptx_75 } // zeromask PTX code for various compute capabilities. 
-const ( - zeromask_ptx_30 = ` +const( + zeromask_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -136,7 +137,7 @@ BB0_3: ` - zeromask_ptx_35 = ` + zeromask_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -196,7 +197,7 @@ BB0_3: ` - zeromask_ptx_37 = ` + zeromask_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -256,7 +257,7 @@ BB0_3: ` - zeromask_ptx_50 = ` + zeromask_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -316,7 +317,7 @@ BB0_3: ` - zeromask_ptx_52 = ` + zeromask_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -376,7 +377,7 @@ BB0_3: ` - zeromask_ptx_53 = ` + zeromask_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -436,7 +437,7 @@ BB0_3: ` - zeromask_ptx_60 = ` + zeromask_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -496,7 +497,7 @@ BB0_3: ` - zeromask_ptx_61 = ` + zeromask_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -556,7 +557,7 @@ BB0_3: ` - zeromask_ptx_70 = ` + zeromask_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -616,7 +617,7 @@ BB0_3: ` - zeromask_ptx_75 = ` + zeromask_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -676,4 +677,4 @@ BB0_3: ` -) + ) diff --git a/cuda/zhangli2_wrapper.go b/cuda/zhangli2_wrapper.go index ece332e93..da0434a9a 100644 --- a/cuda/zhangli2_wrapper.go +++ b/cuda/zhangli2_wrapper.go @@ -5,86 +5,86 @@ package cuda EDITING IS FUTILE. 
*/ -import ( +import( + "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" - "unsafe" ) // CUDA handle for addzhanglitorque2 kernel var addzhanglitorque2_code cu.Function // Stores the arguments for addzhanglitorque2 kernel invocation -type addzhanglitorque2_args_t struct { - arg_tx unsafe.Pointer - arg_ty unsafe.Pointer - arg_tz unsafe.Pointer - arg_mx unsafe.Pointer - arg_my unsafe.Pointer - arg_mz unsafe.Pointer - arg_Ms_ unsafe.Pointer - arg_Ms_mul float32 - arg_jx_ unsafe.Pointer - arg_jx_mul float32 - arg_jy_ unsafe.Pointer - arg_jy_mul float32 - arg_jz_ unsafe.Pointer - arg_jz_mul float32 - arg_alpha_ unsafe.Pointer - arg_alpha_mul float32 - arg_xi_ unsafe.Pointer - arg_xi_mul float32 - arg_pol_ unsafe.Pointer - arg_pol_mul float32 - arg_cx float32 - arg_cy float32 - arg_cz float32 - arg_Nx int - arg_Ny int - arg_Nz int - arg_PBC byte - argptr [27]unsafe.Pointer +type addzhanglitorque2_args_t struct{ + arg_tx unsafe.Pointer + arg_ty unsafe.Pointer + arg_tz unsafe.Pointer + arg_mx unsafe.Pointer + arg_my unsafe.Pointer + arg_mz unsafe.Pointer + arg_Ms_ unsafe.Pointer + arg_Ms_mul float32 + arg_jx_ unsafe.Pointer + arg_jx_mul float32 + arg_jy_ unsafe.Pointer + arg_jy_mul float32 + arg_jz_ unsafe.Pointer + arg_jz_mul float32 + arg_alpha_ unsafe.Pointer + arg_alpha_mul float32 + arg_xi_ unsafe.Pointer + arg_xi_mul float32 + arg_pol_ unsafe.Pointer + arg_pol_mul float32 + arg_cx float32 + arg_cy float32 + arg_cz float32 + arg_Nx int + arg_Ny int + arg_Nz int + arg_PBC byte + argptr [27]unsafe.Pointer sync.Mutex } // Stores the arguments for addzhanglitorque2 kernel invocation var addzhanglitorque2_args addzhanglitorque2_args_t -func init() { +func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
- addzhanglitorque2_args.argptr[0] = unsafe.Pointer(&addzhanglitorque2_args.arg_tx) - addzhanglitorque2_args.argptr[1] = unsafe.Pointer(&addzhanglitorque2_args.arg_ty) - addzhanglitorque2_args.argptr[2] = unsafe.Pointer(&addzhanglitorque2_args.arg_tz) - addzhanglitorque2_args.argptr[3] = unsafe.Pointer(&addzhanglitorque2_args.arg_mx) - addzhanglitorque2_args.argptr[4] = unsafe.Pointer(&addzhanglitorque2_args.arg_my) - addzhanglitorque2_args.argptr[5] = unsafe.Pointer(&addzhanglitorque2_args.arg_mz) - addzhanglitorque2_args.argptr[6] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_) - addzhanglitorque2_args.argptr[7] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_mul) - addzhanglitorque2_args.argptr[8] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_) - addzhanglitorque2_args.argptr[9] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_mul) - addzhanglitorque2_args.argptr[10] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_) - addzhanglitorque2_args.argptr[11] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_mul) - addzhanglitorque2_args.argptr[12] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_) - addzhanglitorque2_args.argptr[13] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_mul) - addzhanglitorque2_args.argptr[14] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_) - addzhanglitorque2_args.argptr[15] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_mul) - addzhanglitorque2_args.argptr[16] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_) - addzhanglitorque2_args.argptr[17] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_mul) - addzhanglitorque2_args.argptr[18] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_) - addzhanglitorque2_args.argptr[19] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_mul) - addzhanglitorque2_args.argptr[20] = unsafe.Pointer(&addzhanglitorque2_args.arg_cx) - addzhanglitorque2_args.argptr[21] = unsafe.Pointer(&addzhanglitorque2_args.arg_cy) - addzhanglitorque2_args.argptr[22] = unsafe.Pointer(&addzhanglitorque2_args.arg_cz) - 
addzhanglitorque2_args.argptr[23] = unsafe.Pointer(&addzhanglitorque2_args.arg_Nx) - addzhanglitorque2_args.argptr[24] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ny) - addzhanglitorque2_args.argptr[25] = unsafe.Pointer(&addzhanglitorque2_args.arg_Nz) - addzhanglitorque2_args.argptr[26] = unsafe.Pointer(&addzhanglitorque2_args.arg_PBC) -} + addzhanglitorque2_args.argptr[0] = unsafe.Pointer(&addzhanglitorque2_args.arg_tx) + addzhanglitorque2_args.argptr[1] = unsafe.Pointer(&addzhanglitorque2_args.arg_ty) + addzhanglitorque2_args.argptr[2] = unsafe.Pointer(&addzhanglitorque2_args.arg_tz) + addzhanglitorque2_args.argptr[3] = unsafe.Pointer(&addzhanglitorque2_args.arg_mx) + addzhanglitorque2_args.argptr[4] = unsafe.Pointer(&addzhanglitorque2_args.arg_my) + addzhanglitorque2_args.argptr[5] = unsafe.Pointer(&addzhanglitorque2_args.arg_mz) + addzhanglitorque2_args.argptr[6] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_) + addzhanglitorque2_args.argptr[7] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_mul) + addzhanglitorque2_args.argptr[8] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_) + addzhanglitorque2_args.argptr[9] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_mul) + addzhanglitorque2_args.argptr[10] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_) + addzhanglitorque2_args.argptr[11] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_mul) + addzhanglitorque2_args.argptr[12] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_) + addzhanglitorque2_args.argptr[13] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_mul) + addzhanglitorque2_args.argptr[14] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_) + addzhanglitorque2_args.argptr[15] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_mul) + addzhanglitorque2_args.argptr[16] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_) + addzhanglitorque2_args.argptr[17] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_mul) + addzhanglitorque2_args.argptr[18] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_) + 
addzhanglitorque2_args.argptr[19] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_mul) + addzhanglitorque2_args.argptr[20] = unsafe.Pointer(&addzhanglitorque2_args.arg_cx) + addzhanglitorque2_args.argptr[21] = unsafe.Pointer(&addzhanglitorque2_args.arg_cy) + addzhanglitorque2_args.argptr[22] = unsafe.Pointer(&addzhanglitorque2_args.arg_cz) + addzhanglitorque2_args.argptr[23] = unsafe.Pointer(&addzhanglitorque2_args.arg_Nx) + addzhanglitorque2_args.argptr[24] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ny) + addzhanglitorque2_args.argptr[25] = unsafe.Pointer(&addzhanglitorque2_args.arg_Nz) + addzhanglitorque2_args.argptr[26] = unsafe.Pointer(&addzhanglitorque2_args.arg_PBC) + } // Wrapper for addzhanglitorque2 CUDA kernel, asynchronous. -func k_addzhanglitorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, jx_ unsafe.Pointer, jx_mul float32, jy_ unsafe.Pointer, jy_mul float32, jz_ unsafe.Pointer, jz_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, xi_ unsafe.Pointer, xi_mul float32, pol_ unsafe.Pointer, pol_mul float32, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { - if Synchronous { // debug +func k_addzhanglitorque2_async ( tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, jx_ unsafe.Pointer, jx_mul float32, jy_ unsafe.Pointer, jy_mul float32, jz_ unsafe.Pointer, jz_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, xi_ unsafe.Pointer, xi_mul float32, pol_ unsafe.Pointer, pol_mul float32, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { + if Synchronous{ // debug Sync() timer.Start("addzhanglitorque2") } @@ -92,63 +92,64 @@ func k_addzhanglitorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.P addzhanglitorque2_args.Lock() defer addzhanglitorque2_args.Unlock() - 
if addzhanglitorque2_code == 0 { + if addzhanglitorque2_code == 0{ addzhanglitorque2_code = fatbinLoad(addzhanglitorque2_map, "addzhanglitorque2") } - addzhanglitorque2_args.arg_tx = tx - addzhanglitorque2_args.arg_ty = ty - addzhanglitorque2_args.arg_tz = tz - addzhanglitorque2_args.arg_mx = mx - addzhanglitorque2_args.arg_my = my - addzhanglitorque2_args.arg_mz = mz - addzhanglitorque2_args.arg_Ms_ = Ms_ - addzhanglitorque2_args.arg_Ms_mul = Ms_mul - addzhanglitorque2_args.arg_jx_ = jx_ - addzhanglitorque2_args.arg_jx_mul = jx_mul - addzhanglitorque2_args.arg_jy_ = jy_ - addzhanglitorque2_args.arg_jy_mul = jy_mul - addzhanglitorque2_args.arg_jz_ = jz_ - addzhanglitorque2_args.arg_jz_mul = jz_mul - addzhanglitorque2_args.arg_alpha_ = alpha_ - addzhanglitorque2_args.arg_alpha_mul = alpha_mul - addzhanglitorque2_args.arg_xi_ = xi_ - addzhanglitorque2_args.arg_xi_mul = xi_mul - addzhanglitorque2_args.arg_pol_ = pol_ - addzhanglitorque2_args.arg_pol_mul = pol_mul - addzhanglitorque2_args.arg_cx = cx - addzhanglitorque2_args.arg_cy = cy - addzhanglitorque2_args.arg_cz = cz - addzhanglitorque2_args.arg_Nx = Nx - addzhanglitorque2_args.arg_Ny = Ny - addzhanglitorque2_args.arg_Nz = Nz - addzhanglitorque2_args.arg_PBC = PBC + addzhanglitorque2_args.arg_tx = tx + addzhanglitorque2_args.arg_ty = ty + addzhanglitorque2_args.arg_tz = tz + addzhanglitorque2_args.arg_mx = mx + addzhanglitorque2_args.arg_my = my + addzhanglitorque2_args.arg_mz = mz + addzhanglitorque2_args.arg_Ms_ = Ms_ + addzhanglitorque2_args.arg_Ms_mul = Ms_mul + addzhanglitorque2_args.arg_jx_ = jx_ + addzhanglitorque2_args.arg_jx_mul = jx_mul + addzhanglitorque2_args.arg_jy_ = jy_ + addzhanglitorque2_args.arg_jy_mul = jy_mul + addzhanglitorque2_args.arg_jz_ = jz_ + addzhanglitorque2_args.arg_jz_mul = jz_mul + addzhanglitorque2_args.arg_alpha_ = alpha_ + addzhanglitorque2_args.arg_alpha_mul = alpha_mul + addzhanglitorque2_args.arg_xi_ = xi_ + addzhanglitorque2_args.arg_xi_mul = xi_mul + 
addzhanglitorque2_args.arg_pol_ = pol_ + addzhanglitorque2_args.arg_pol_mul = pol_mul + addzhanglitorque2_args.arg_cx = cx + addzhanglitorque2_args.arg_cy = cy + addzhanglitorque2_args.arg_cz = cz + addzhanglitorque2_args.arg_Nx = Nx + addzhanglitorque2_args.arg_Ny = Ny + addzhanglitorque2_args.arg_Nz = Nz + addzhanglitorque2_args.arg_PBC = PBC + args := addzhanglitorque2_args.argptr[:] cu.LaunchKernel(addzhanglitorque2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) - if Synchronous { // debug + if Synchronous{ // debug Sync() timer.Stop("addzhanglitorque2") } } // maps compute capability on PTX code for addzhanglitorque2 kernel. -var addzhanglitorque2_map = map[int]string{0: "", - 30: addzhanglitorque2_ptx_30, - 35: addzhanglitorque2_ptx_35, - 37: addzhanglitorque2_ptx_37, - 50: addzhanglitorque2_ptx_50, - 52: addzhanglitorque2_ptx_52, - 53: addzhanglitorque2_ptx_53, - 60: addzhanglitorque2_ptx_60, - 61: addzhanglitorque2_ptx_61, - 70: addzhanglitorque2_ptx_70, - 75: addzhanglitorque2_ptx_75} +var addzhanglitorque2_map = map[int]string{ 0: "" , +30: addzhanglitorque2_ptx_30 , +35: addzhanglitorque2_ptx_35 , +37: addzhanglitorque2_ptx_37 , +50: addzhanglitorque2_ptx_50 , +52: addzhanglitorque2_ptx_52 , +53: addzhanglitorque2_ptx_53 , +60: addzhanglitorque2_ptx_60 , +61: addzhanglitorque2_ptx_61 , +70: addzhanglitorque2_ptx_70 , +75: addzhanglitorque2_ptx_75 } // addzhanglitorque2 PTX code for various compute capabilities. 
-const ( - addzhanglitorque2_ptx_30 = ` +const( + addzhanglitorque2_ptx_30 = ` .version 6.3 .target sm_30 .address_size 64 @@ -786,7 +787,7 @@ BB0_78: ` - addzhanglitorque2_ptx_35 = ` + addzhanglitorque2_ptx_35 = ` .version 6.3 .target sm_35 .address_size 64 @@ -1382,7 +1383,7 @@ BB0_78: ` - addzhanglitorque2_ptx_37 = ` + addzhanglitorque2_ptx_37 = ` .version 6.3 .target sm_37 .address_size 64 @@ -1978,7 +1979,7 @@ BB0_78: ` - addzhanglitorque2_ptx_50 = ` + addzhanglitorque2_ptx_50 = ` .version 6.3 .target sm_50 .address_size 64 @@ -2574,7 +2575,7 @@ BB0_78: ` - addzhanglitorque2_ptx_52 = ` + addzhanglitorque2_ptx_52 = ` .version 6.3 .target sm_52 .address_size 64 @@ -3170,7 +3171,7 @@ BB0_78: ` - addzhanglitorque2_ptx_53 = ` + addzhanglitorque2_ptx_53 = ` .version 6.3 .target sm_53 .address_size 64 @@ -3766,7 +3767,7 @@ BB0_78: ` - addzhanglitorque2_ptx_60 = ` + addzhanglitorque2_ptx_60 = ` .version 6.3 .target sm_60 .address_size 64 @@ -4362,7 +4363,7 @@ BB0_78: ` - addzhanglitorque2_ptx_61 = ` + addzhanglitorque2_ptx_61 = ` .version 6.3 .target sm_61 .address_size 64 @@ -4958,7 +4959,7 @@ BB0_78: ` - addzhanglitorque2_ptx_70 = ` + addzhanglitorque2_ptx_70 = ` .version 6.3 .target sm_70 .address_size 64 @@ -5554,7 +5555,7 @@ BB0_78: ` - addzhanglitorque2_ptx_75 = ` + addzhanglitorque2_ptx_75 = ` .version 6.3 .target sm_75 .address_size 64 @@ -6150,4 +6151,4 @@ BB0_78: ` -) + )