diff --git a/PLANNING.md b/PLANNING.md
index 7fd6f956..b4258d26 100644
--- a/PLANNING.md
+++ b/PLANNING.md
@@ -101,6 +101,14 @@ Other tracks are stretch goals, contributions towards them are accepted.
 - introduce batchAffine_vartime
 - Optimized square_repeated in assembly for Montgomery and Crandall/Pseudo-Mersenne primes
 - Optimized elliptic curve directly calling assembly without ADX checks and limited input/output movement in registers or using function multi-versioning.
+- LLVM IR:
+  - use internal or private linkage type
+  - look into calling conventions like "fast" or "tail"
+  - check if returning a value from a function is properly optimized
+    compared to an in-place result
+  - use the readnone (pure) and readonly attributes for functions
+  - look into passing parameters as arrays instead of pointers?
+  - use the hot function attribute

### User Experience track

diff --git a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
index 12da5fcd..0db85007 100644
--- a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
@@ -80,7 +80,7 @@ proc finalSubMayOverflowImpl*(
     ctx.mov scratch[i], a[i]
     ctx.sbb scratch[i], M[i]

-  # If it overflows here, it means that it was
+  # If it underflows here, it means that it was
   # smaller than the modulus and we don't need `scratch`
   ctx.sbb scratchReg, 0

diff --git a/constantine/math_compiler/README.md b/constantine/math_compiler/README.md
new file mode 100644
index 00000000..2ff81b44
--- /dev/null
+++ b/constantine/math_compiler/README.md
@@ -0,0 +1,83 @@
+# Cryptography primitive compiler
+
+This implements a cryptography compiler that can be used to produce
+- high-performance JIT code for GPUs
+- or assembly files, for CPUs when we want to ensure
+  there are no side-channel regressions for secret data
+- or vectorized assembly files, as LLVM IR is significantly
+  more convenient for modeling vector operations
+
+There are also LLVM IR => FPGA translators that might be useful
+in the future.
+
+## Platform limitations
+
+- X86 cannot use the dual carry-chain ADCX/ADOX easily:
+  - no native support for clearing a flag with `xor`
+    and keeping it clear;
+  - inline assembly cannot use the raw ASM printer,
+    so the workflow will need to compile -> decompile.
+- Nvidia GPUs cannot lower types larger than 64-bit, hence we cannot use i256 for example.
+- AMD GPUs have 1/4 the throughput for i32 MUL compared to f32 MUL or i24 MUL.
+- non-x86 targets may not be as optimized for matching
+  addcarry and subborrow patterns, even with @llvm.usub.with.overflow.
+
+## ABI
+
+Internal functions are:
+- prefixed with `_`
+- Linkage: internal
+- calling convention: "fast"
+- marked `hot` for field arithmetic functions
+
+Internal global constants are:
+- prefixed with `_`
+- Linkage: linkonce_odr (so they are merged with globals of the same name)
+
+External functions use the default convention.
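+
+As an illustration, these conventions would produce IR shaped like the
+hand-written sketch below (not actual compiler output: the helper name follows
+the internal naming scheme further down, the modulus value is elided, and the
+exact signatures may differ):
+
+```llvm
+; modulus global (named after the field), merged across modules by linkonce_odr
+@bls12_381_fp_mod = linkonce_odr constant i384 0 ; actual value elided
+
+; internal helper: `_` prefix, internal linkage, fastcc, hot
+define internal fastcc void @_modadd_noo.u64x6(ptr %r, ptr %a, ptr %b, ptr %M) hot {
+  ret void ; limb arithmetic elided
+}
+
+; public entry point: default calling convention, stable name
+define void @bls12_381_fp_add(ptr %r, ptr %a, ptr %b) {
+  call fastcc void @_modadd_noo.u64x6(ptr %r, ptr %a, ptr %b, ptr @bls12_381_fp_mod)
+  ret void
+}
+```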
+
+We ensure parameters / return values fit in registers:
+- https://llvm.org/docs/Frontend/PerformanceTips.html
+
+TODO:
+- function alignment: look into
+  - https://www.bazhenov.me/posts/2024-02-performance-roulette/
+  - https://lkml.org/lkml/2015/5/21/443
+- function multiversioning
+- aggregate alignment (via datalayout)
+
+Naming convention for internal procedures:
+- _big_add_u64x4
+- _finalsub_mayo_u64x4 -> final subtraction may overflow
+- _finalsub_noo_u64x4 -> final sub no overflow
+- _mod_add_u64x4
+- _mod_add2x_u64x8 -> FpDbl backend
+- _mty_mulur_u64x4b2 -> unreduced Montgomery multiplication (unreduced result valid iff 2 spare bits)
+- _mty_mul_u64x4b1 -> reduced Montgomery multiplication (result valid iff at least 1 spare bit)
+- _mty_mul_u64x4 -> reduced Montgomery multiplication
+- _mty_nsqrur_u64x4b2 -> unreduced square n times
+- _mty_nsqr_u64x4b1 -> reduced square n times
+- _mty_sqr_u64x4 -> square
+- _mty_red_u64x4 -> reduction u64x4 <- u64x8
+- _pmp_red_mayo_u64x4 -> Pseudo-Mersenne Prime partial reduction may overflow (secp256k1)
+- _pmp_red_noo_u64x4 -> Pseudo-Mersenne Prime partial reduction no overflow
+- _secp256k1_red -> special reduction
+- _fp2x_sqr2x_u64x4 -> Fp2 complex, Fp -> FpDbl lazy reduced squaring
+- _fp2g_sqr2x_u64x4 -> Fp2 generic/non-complex (do we pass the mul-non-residue as parameter?)
+- _fp2_sqr_u64x4 -> Fp2 (pass the mul-by-non-residue function as parameter)
+- _fp4o2_mulnr1pi_u64x4 -> Fp4 over Fp2 mul with (1+i) non-residue optimization
+- _fp4o2_mulbynr_u64x4
+- _fp12_add_u64x4
+- _fp12o4o2_mul_u64x4 -> Fp12 over Fp4 over Fp2
+- _ecg1swjac_adda0_u64x4 -> Short Weierstrass G1 Jacobian addition a=0
+- _ecg1swjac_add_u64x4_var -> Short Weierstrass G1 Jacobian vartime addition
+- _ectwprj_add_u64x4 -> Twisted Edwards Projective addition
+
+Vectorized:
+- _big_add_u64x4v4
+- _big_add_u32x8v8
+
+Naming for external procedures:
+- bls12_381_fp_add
+- bls12_381_fr_add
+- bls12_381_fp12_add
diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim
index 19e92019..fdc4c393 100644
--- a/constantine/math_compiler/codegen_nvidia.nim
+++ b/constantine/math_compiler/codegen_nvidia.nim
@@ -9,12 +9,12 @@

 import
   constantine/platforms/abis/nvidia_abi {.all.},
   constantine/platforms/abis/c_abi,
-  constantine/platforms/llvm/[llvm, nvidia_inlineasm],
+  constantine/platforms/llvm/llvm,
   constantine/platforms/primitives,
   ./ir

 export
-  nvidia_abi, nvidia_inlineasm,
+  nvidia_abi,
   Flag, flag, wrapOpenArrayLenType

 # ############################################################
@@ -115,22 +115,6 @@ proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =
 #
 # ############################################################

-proc tagCudaKernel(module: ModuleRef, fn: FnDef) =
-  ## Tag a function as a Cuda Kernel, i.e. callable from host
-
-  doAssert fn.fnTy.getReturnType().isVoid(), block:
-    "Kernels must not return values but function returns " & $fn.fnTy.getReturnType().getTypeKind()
-
-  let ctx = module.getContext()
-  module.addNamedMetadataOperand(
-    "nvvm.annotations",
-    ctx.asValueRef(ctx.metadataNode([
-      fn.fnImpl.asMetadataRef(),
-      ctx.metadataNode("kernel"),
-      constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef()
-    ]))
-  )
-
 proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
   ## Create a public wrapper of a cuda device function
   ##
diff --git a/constantine/math_compiler/impl_fields_globals.nim b/constantine/math_compiler/impl_fields_globals.nim
new file mode 100644
index 00000000..faac591c
--- /dev/null
+++ b/constantine/math_compiler/impl_fields_globals.nim
@@ -0,0 +1,216 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  constantine/platforms/bithacks,
+  constantine/platforms/llvm/llvm,
+  constantine/serialization/[io_limbs, codecs],
+  constantine/named/deriv/precompute
+
+import ./ir
+
+# ############################################################
+#
+#                Metadata precomputation
+#
+# ############################################################
+
+# Constantine on CPU is configured at compile-time for several properties that need to be runtime-configurable on GPUs:
+# - word size (32-bit or 64-bit)
+# - curve properties access like modulus bitsize or -1/M[0] a.k.a. m0ninv
+# - constants are stored in freestanding `const`
+#
+# This is because it's not possible to store a BigInt[254] and a BigInt[384]
+# in a generic way in the same structure, especially without using heap allocation.
+# And with Nim's dead code elimination, unused curves are not compiled in.
+#
+# As-is, there would be no easy way to dynamically retrieve (via an array or a table)
+#   const BLS12_381_modulus = ...
+#   const BN254_Snarks_modulus = ...
+#
+# - We would need a macro to properly access each constant.
+# - We would need to create a 32-bit and a 64-bit version.
+# - Unused curves would be compiled in the program.
+#
+# Note: on GPU we don't manipulate secrets hence branches and dynamic memory allocations are allowed.
+#
+# As GPU is a niche usage, instead we recreate the relevant `precompute` and IO procedures
+# with dynamic wordsize support.
+
+type
+  DynWord = uint32 or uint64
+  BigNum[T: DynWord] = object
+    bits: uint32
+    limbs: seq[T]
+
+# Serialization
+# ------------------------------------------------
+
+func byteLen(bits: SomeInteger): SomeInteger {.inline.} =
+  ## Length in bytes to serialize BigNum
+  (bits + 7) shr 3 # (bits + 8 - 1) div 8
+
+func fromHex[T](a: var BigNum[T], s: string) =
+  var bytes = newSeq[byte](a.bits.byteLen())
+  bytes.paddedFromHex(s, bigEndian)
+
+  # 2.
Convert canonical uint to BigNum + const wordBitwidth = sizeof(T) * 8 + a.limbs.unmarshal(bytes, wordBitwidth, bigEndian) + +func fromHex[T](BN: type BigNum[T], bits: uint32, s: string): BN = + const wordBitwidth = sizeof(T) * 8 + let numWords = wordsRequired(bits, wordBitwidth) + + result.bits = bits + result.limbs.setLen(numWords) + result.fromHex(s) + +func toHexLlvm*[T](a: BigNum[T]): string = + ## Conversion to big-endian hex suitable for LLVM literals + ## It MUST NOT have a prefix + ## This is variable-time + # 1. Convert BigInt to canonical uint + const wordBitwidth = sizeof(T) * 8 + var bytes = newSeq[byte](byteLen(a.bits)) + bytes.marshal(a.limbs, wordBitwidth, bigEndian) + + # 2. Convert canonical uint to hex + const hexChars = "0123456789abcdef" + result = newString(2 * bytes.len) + for i in 0 ..< bytes.len: + let bi = bytes[i] + result[2*i] = hexChars[bi shr 4 and 0xF] + result[2*i+1] = hexChars[bi and 0xF] + +# Checks +# ------------------------------------------------ + +func checkValidModulus(M: BigNum) = + const wordBitwidth = uint32(BigNum.T.sizeof() * 8) + let expectedMsb = M.bits-1 - wordBitwidth * (M.limbs.len.uint32 - 1) + let msb = log2_vartime(M.limbs[M.limbs.len-1]) + + doAssert msb == expectedMsb, "Internal Error: the modulus must use all declared bits and only those:\n" & + " Modulus '0x" & M.toHexLlvm() & "' is declared with " & $M.bits & + " bits but uses " & $(msb + wordBitwidth * uint32(M.limbs.len - 1)) & " bits." + +# Fields metadata +# ------------------------------------------------ + +func negInvModWord[T](M: BigNum[T]): T = + ## Returns the Montgomery domain magic constant for the input modulus: + ## + ## µ ≡ -1/M[0] (mod SecretWord) + ## + ## M[0] is the least significant limb of M + ## M must be odd and greater than 2. 
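+  ##
+  ## (Known value for cross-checking, not computed here: with 64-bit words,
+  ## BLS12-381's modulus gives µ = 0x89f3fffcfffcfffd.)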
+ ## + ## Assuming 64-bit words: + ## + ## µ ≡ -1/M[0] (mod 2^64) + checkValidModulus(M) + return M.limbs[0].negInvModWord() + +# ############################################################ +# +# Globals in IR +# +# ############################################################ + +proc getModulusPtr*(asy: Assembler_LLVM, fd: FieldDescriptor): ValueRef = + let modname = fd.name & "_mod" + var M = asy.module.getGlobal(cstring modname) + if M.isNil(): + M = asy.defineGlobalConstant( + name = modname, + section = fd.name, + constIntOfStringAndSize(fd.intBufTy, fd.modulus, 16), + fd.intBufTy, + alignment = 64 + ) + return M + +proc getM0ninv*(asy: Assembler_LLVM, fd: FieldDescriptor): ValueRef = + let m0ninvname = fd.name & "_m0ninv" + var m0ninv = asy.module.getGlobal(cstring m0ninvname) + if m0ninv.isNil(): + if fd.w == 32: + let M = BigNum[uint32].fromHex(fd.bits, fd.modulus) + m0ninv = asy.defineGlobalConstant( + name = m0ninvname, + section = fd.name, + constInt(fd.wordTy, M.negInvModWord()), + fd.wordTy + ) + else: + let M = BigNum[uint64].fromHex(fd.bits, fd.modulus) + m0ninv = asy.defineGlobalConstant( + name = m0ninvname, + section = fd.name, + constInt(fd.wordTy, M.negInvModWord()), + fd.wordTy + ) + + + return m0ninv + +when isMainModule: + let asy = Assembler_LLVM.new("test_module", bkX86_64_Linux) + let fd = asy.ctx.configureField( + "bls12_381_fp", + 381, + "1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab", + v = 1, w = 64) + + discard asy.getModulusPtr(fd) + discard asy.getM0ninv(fd) + + echo "=========================================" + echo "LLVM IR\n" + + echo asy.module + echo "=========================================" + + asy.module.verify(AbortProcessAction) + + # -------------------------------------------- + # See the assembly - note it might be different from what the JIT compiler did + initializeFullNativeTarget() + + const triple = "x86_64-pc-linux-gnu" + + let machine = createTargetMachine( + target = toTarget(triple), + triple = triple, + cpu = "", + features = "adx,bmi2", # TODO check the proper way to pass options + level = CodeGenLevelAggressive, + reloc = RelocDefault, + codeModel = CodeModelDefault + ) + + let pbo = createPassBuilderOptions() + let err = asy.module.runPasses( + "default,function-attrs,memcpyopt,sroa,mem2reg,gvn,dse,instcombine,inline,adce", + machine, + pbo + ) + if not err.pointer().isNil(): + writeStackTrace() + let errMsg = err.getErrorMessage() + stderr.write("\"codegenX86_64\" for module '" & astToStr(module) & "' " & $instantiationInfo() & + " exited with error: " & $cstring(errMsg) & '\n') + errMsg.dispose() + quit 1 + + echo "=========================================" + echo "Assembly\n" + + echo machine.emitTo[:string](asy.module, AssemblyFile) + echo "=========================================" diff --git a/constantine/math_compiler/impl_fields_nvidia.nim b/constantine/math_compiler/impl_fields_nvidia.nim index 0ffbb5b1..5843d02d 100644 --- a/constantine/math_compiler/impl_fields_nvidia.nim +++ b/constantine/math_compiler/impl_fields_nvidia.nim @@ -7,8 +7,8 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import - ../platforms/llvm/llvm, - ./ir, ./codegen_nvidia + constantine/platforms/llvm/[llvm, asm_nvidia], + ./ir # ############################################################ # @@ -40,8 +40,13 @@ import # but the carry codegen of madc.hi.cc.u64 has off-by-one # - https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 # - old 32-bit bug: https://forums.developer.nvidia.com/t/wrong-result-returned-by-madc-hi-u64-ptx-instruction-for-specific-operands/196094 +# +# See instruction throughput +# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions +# +# We cannot use i256 on Nvidia target: https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp#L244-L276 -proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubMayOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -74,7 +79,7 @@ proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, for i in 0 ..< N: r[i] = bld.slct(scratch[i], a[i], underflowedModulus) -proc finalSubNoOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubNoOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -165,8 +170,8 @@ proc field_sub_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef let t = bld.makeArray(fieldTy) let N = cm.getNumWords(field) let zero = case cm.wordSize - of size32: constInt(asy.i32_t, 0) - of size64: constInt(asy.i64_t, 0) + of w32: constInt(asy.i32_t, 0) + of w64: constInt(asy.i64_t, 0) t[0] = bld.sub_bo(a[0], b[0]) for i in 1 ..< N: @@ -258,8 +263,8 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: let m0ninv = ValueRef cm.getMontgomeryNegInverse0(field) let M = (seq[ValueRef])(cm.getModulus(field)) let zero = case cm.wordSize - of size32: constInt(asy.i32_t, 0) - of size64: constInt(asy.i64_t, 0) + of w32: constInt(asy.i32_t, 0) + of w64: constInt(asy.i64_t, 0) for i in 0 ..< N: # Multiplication @@ -354,4 +359,4 @@ proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub = false): FnDef = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) \ No newline at end of file + return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub) diff --git a/constantine/math_compiler/impl_fields_sat.nim b/constantine/math_compiler/impl_fields_sat.nim new file mode 100644 index 00000000..cd52f96f --- /dev/null +++ b/constantine/math_compiler/impl_fields_sat.nim @@ -0,0 +1,153 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+
+import
+  constantine/platforms/llvm/[llvm, super_instructions],
+  ./ir
+
+# ############################################################
+#
+#             Field arithmetic with saturated limbs
+#
+# ############################################################
+#
+# This implements field operations in pure LLVM
+# using saturated limbs, i.e. 64-bit words on 64-bit platforms.
+#
+# This relies on hardware addition-with-carry and subtraction-with-borrow
+# for efficiency.
+#
+# As such it is not suitable for platforms with no carry flags such as:
+# - WASM
+# - MIPS
+# - RISC-V
+# - Metal
+#
+# It may be suitable for Intel GPUs as the virtual ISA does support add-carry.
+#
+# It is (theoretically) suitable for:
+# - ARM
+# - AMD GPUs
+#
+# The following backends have better optimizations through assembly:
+# - x86: access to ADOX and ADCX interleaved double-carry chain
+# - Nvidia: access to multiply accumulate instruction
+#           and non-interleaved double-carry chain
+#
+# Hardware limitations
+# --------------------
+#
+# AMD GPUs may benefit from using 24-bit limbs
+# - https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/programmer-references/AMD_OpenCL_Programming_Optimization_Guide2.pdf
+#   p2-23:
+#     Generally, the throughput and latency for 32-bit integer operations is the same
+#     as for single-precision floating point operations.
+#     24-bit integer MULs and MADs have four times the throughput of 32-bit integer
+#     multiplies. 24-bit signed and unsigned integers are natively supported on the
+#     GCN family of devices. The use of OpenCL built-in functions for mul24 and mad24
+#     is encouraged. Note that mul24 can be useful for array indexing operations
+#   Doc from 2015, it might not apply to the RDNA family
+# - https://free.eol.cn/edu_net/edudown/AMDppt/OpenCL%20Programming%20and%20Optimization%20-%20Part%20I.pdf
+#   slide 24
+#
+# - https://chipsandcheese.com/2023/01/07/microbenchmarking-amds-rdna-3-graphics-architecture/
+#   "Since Turing, Nvidia also achieves very good integer multiplication performance.
+#    Integer multiplication appears to be extremely rare in shader code,
+#    and AMD doesn’t seem to have optimized for it.
+#    32-bit integer multiplication executes at around a quarter of FP32 rate,
+#    and latency is pretty high too."
+#
+# Software limitations
+# --------------------
+#
+# Unfortunately, implementing the unrolled loop over word-sized add-carry / sub-borrow
+# is fraught with perils:
+#   AMDGPU crash: https://github.com/llvm/llvm-project/issues/102058
+#   ARM64 missed optim: https://github.com/llvm/llvm-project/issues/102062
+#
+# And while using @llvm.usub.with.overflow.i64 lets ARM64 recover the missed optimization,
+# it is still missed on AMDGPU (or Nvidia).
+#
+# And implementing them with i256 / i384 is similarly tricky
+#   https://github.com/llvm/llvm-project/issues/102868
+
+const SectionName = "ctt.fields"
+
+proc finalSubMayOverflow*(asy: Assembler_LLVM, fd: FieldDescriptor, rr, a, M, carry: ValueRef) =
+  ## If a >= Modulus: r <- a-M
+  ## else:            r <- a
+  ##
+  ## This is constant-time straightline code.
+  ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
+  ##
+  ## To be used when the final subtraction can
+  ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)

+  # Mask: contains 0xFFFF or 0x0000
+  let (_, mask) = asy.br.subborrow(fd.zero, fd.zero, carry)
+
+  # Now subtract the modulus, and test a < M
+  # (underflow) with the last borrow
+  let (borrow, a_minus_M) = asy.br.llvm_sub_overflow(a, M)
+
+  # If it underflows here, it means that it was
+  # smaller than the modulus and we don't need `a-M`
+  let (ctl, _) = asy.br.subborrow(mask, fd.zero, borrow)
+
+  let t = asy.br.select(ctl, a, a_minus_M)
+  asy.store(rr, t)
+
+proc finalSubNoOverflow*(asy: Assembler_LLVM, fd: FieldDescriptor, rr, a, M: ValueRef) =
+  ## If a >= Modulus: r <- a-M
+  ## else:            r <- a
+  ##
+  ## This is constant-time straightline code.
+  ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
+  ##
+  ## To be used when the modulus does not use the full bitwidth of the storing words
+  ## (say using 255 bits for the modulus out of 256 available in words)

+  # Now subtract the modulus, and test a < M
+  # (underflow) with the last borrow
+  let (borrow, a_minus_M) = asy.br.llvm_sub_overflow(a, M)
+
+  # If it underflows here, it means that it was
+  # smaller than the modulus and we don't need `a-M`
+  let t = asy.br.select(borrow, a, a_minus_M)
+  asy.store(rr, t)
+
+proc modadd*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) =
+  ## Generate an optimized modular addition kernel
+  ## with parameters `a, b, modulus: Limbs -> Limbs`

+  let red = if fd.spareBits >= 1: "noo"
+            else: "mayo"
+  let name = "_modadd_" & red & ".u" & $fd.w & "x" & $fd.numWords
+  asy.llvmInternalFnDef(
+          name, SectionName,
+          asy.void_t, toTypes([r, a, b, M]),
+          {kHot}):
+
+    tagParameter(1, "sret")
+
+    let (rr, aa, bb, MM) = llvmParams
+
+    # Pointers are opaque in LLVM now
+    let a = asy.load2(fd.intBufTy, aa, "a")
+    let b = asy.load2(fd.intBufTy, bb, "b")
+    let M = asy.load2(fd.intBufTy, MM, "M")
+
+    let (carry, apb) = asy.br.llvm_add_overflow(a, b)
+    if fd.spareBits >= 1:
+      asy.finalSubNoOverflow(fd, rr, apb, M)
+    else:
+      asy.finalSubMayOverflow(fd, rr, apb, M, carry)
+
+    asy.br.retVoid()
+
+  asy.callFn(name, [r, a, b, M])
diff --git a/constantine/math_compiler/ir.nim b/constantine/math_compiler/ir.nim
index 1523fdab..ab80e036 100644
--- a/constantine/math_compiler/ir.nim
+++ b/constantine/math_compiler/ir.nim
@@ -7,12 +7,9 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  constantine/named/algebras,
-  constantine/named/deriv/precompute,
-  constantine/math/io/io_bigints,
-  constantine/platforms/[primitives, bithacks],
-  constantine/platforms/llvm/llvm,
-  constantine/serialization/[endians, codecs, io_limbs]
+  constantine/platforms/bithacks,
+  constantine/platforms/llvm/[llvm, super_instructions],
+  std/tables

 # ############################################################
 #
@@ -21,280 +18,198 @@ import
 # ############################################################

 type
+  AttrKind* = enum
+    # Other important properties like
+    # - norecurse
+    # - memory side-effects memory(argmem: readwrite)
+    #   can be deduced.
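+    #   (the function-attrs pass, part of the default pipeline used in
+    #    impl_fields_globals.nim, infers those automatically)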
+ kHot, + kInline, + kAlwaysInline, + kNoInline + Assembler_LLVM* = ref object - # LLVM ctx*: ContextRef module*: ModuleRef - builder*: BuilderRef - i1_t*, i32_t*, i64_t*, i128_t*, void_t*: TypeRef + br*: BuilderRef + datalayout: TargetDataRef + psize*: int32 + publicCC: CallingConvention backend*: Backend + byteOrder: ByteOrder + + # It doesn't seem possible to retrieve a function type + # from its value, so we store them here. + # If we store the type we might as well store the impl + # and we store whether it's internal to apply the fastcc calling convention + fns: Table[string, tuple[ty: TypeRef, impl: ValueRef, internal: bool]] + attrs: array[AttrKind, AttributeRef] + + # Convenience + void_t*: TypeRef Backend* = enum + bkAmdGpu bkNvidiaPTX bkX86_64_Linux - FnDef* = tuple[fnTy: TypeRef, fnImpl: ValueRef] - # calling getTypeOf on a ValueRef function - # loses type information like return value type or arity - proc finalizeAssemblerLLVM(asy: Assembler_LLVM) = if not asy.isNil: - asy.builder.dispose() + asy.br.dispose() asy.module.dispose() asy.ctx.dispose() + # asy.datalayout.dispose() # unnecessary when module is cleared + +proc configure(asy: var Assembler_LLVM, backend: Backend) = + case backend + of bkAmdGpu: + asy.module.setTarget("amdgcn-amd-amdhsa") + + const datalayout1 {.used.} = + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-" & + "i64:64-" & + "v16:16-v24:32-" & + "v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" & + "n32:64-S32-A5-G1-ni:7" + + const datalayout2 = + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-" & + "i64:64-" & + "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" & + "n32:64-S32-A5-G1-ni:7:8" + + asy.module.setDataLayout(datalayout2) + + of bkNvidiaPTX: + asy.module.setTarget("nvptx64-nvidia-cuda") + # Datalayout for NVVM IR 1.8 (CUDA 11.6) + asy.module.setDataLayout( + "e-" & "p:64:64:64-" & + "i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" & + "f32:32:32-f64:64:64-" & + "v16:16:16-v32:32:32-v64:64:64-v128:128:128-" & + "n16:32:64") + of bkX86_64_Linux: + asy.module.setTarget("x86_64-pc-linux-gnu") + + asy.datalayout = asy.module.getDataLayout() + asy.psize = int32 asy.datalayout.getPointerSize() + asy.backend = backend + asy.byteOrder = asy.dataLayout.getEndianness() proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assembler_LLVM = new result, finalizeAssemblerLLVM result.ctx = createContext() result.module = result.ctx.createModule(moduleName) + result.br = result.ctx.createBuilder() + result.datalayout = result.module.getDataLayout() - case backend - of bkNvidiaPTX: - result.module.setTarget("nvptx64-nvidia-cuda") - # Datalayout for NVVM IR 1.8 (CUDA 11.6) - result.module.setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64") - of bkX86_64_Linux: - {.warning : "The x86 LLVM backend is incomplete and for research purposes only".} - result.module.setTarget("x86_64-pc-linux-gnu") - - result.builder = result.ctx.createBuilder() - result.i1_t = result.ctx.int1_t() - result.i32_t = result.ctx.int32_t() - result.i64_t = result.ctx.int64_t() - result.i128_t = result.ctx.int128_t() result.void_t = result.ctx.void_t() - result.backend = backend + + result.configure(backend) + + result.attrs[kHot] = result.ctx.createAttr("hot") + result.attrs[kInline] = result.ctx.createAttr("inlinehint") + 
result.attrs[kAlwaysInline] = result.ctx.createAttr("alwaysinline")
+  result.attrs[kNoInline] = result.ctx.createAttr("noinline")

 # ############################################################
 #
-#                Metadata precomputation
+#                      Syntax Sugar
 #
 # ############################################################

-# Constantine on CPU is configured at compile-time for several properties that need to be runtime configuration GPUs:
-# - word size (32-bit or 64-bit)
-# - curve properties access like modulus bitsize or -1/M[0] a.k.a. m0ninv
-# - constants are stored in freestanding `const`
-#
-# This is because it's not possible to store a BigInt[254] and a BigInt[384]
-# in a generic way in the same structure, especially without using heap allocation.
-# And with Nim's dead code elimination, unused curves are not compiled in.
-#
-# As there would be no easy way to dynamically retrieve (via an array or a table)
-# const BLS12_381_modulus = ...
-# const BN254_Snarks_modulus = ...
-#
-# - We would need a macro to properly access each constant.
-# - We would need to create a 32-bit and a 64-bit version.
-# - Unused curves would be compiled in the program.
-#
-# Note: on GPU we don't manipulate secrets hence branches and dynamic memory allocations are allowed.
-#
-# As GPU is a niche usage, instead we recreate the relevant `precompute` and IO procedures
-# with dynamic wordsize support.
-
-type
-  DynWord = uint32 or uint64
-  BigNum[T: DynWord] = object
-    bits: uint32
-    limbs: seq[T]
+func i1*(asy: Assembler_LLVM, v: SomeInteger): ValueRef =
+  constInt(asy.ctx.int1_t(), v)

-# Serialization
-# ------------------------------------------------
+func i32*(asy: Assembler_LLVM, v: SomeInteger): ValueRef =
+  constInt(asy.ctx.int32_t(), v)

-func byteLen(bits: SomeInteger): SomeInteger {.inline.} =
-  ## Length in bytes to serialize BigNum
-  (bits + 7) shr 3 # (bits + 8 - 1) div 8
+# ############################################################
+#
+#                Intermediate Representation
+#
+# ############################################################

-func wordsRequired(bits, wordBitwidth: SomeInteger): SomeInteger {.inline.} =
+func wordsRequired*(bits, wordBitwidth: SomeInteger): SomeInteger {.inline.} =
   ## Compute the number of limbs required
   ## from the announced bit length
-  debug: doAssert wordBitwidth == 32 or wordBitwidth == 64 # Power of 2
+  doAssert wordBitwidth == 32 or wordBitwidth == 64 # Power of 2
   (bits + wordBitwidth - 1) shr log2_vartime(uint32 wordBitwidth) # 5x to 55x faster than dividing by wordBitwidth

-func fromHex[T](a: var BigNum[T], s: string) =
-  var bytes = newSeq[byte](a.bits.byteLen())
-  bytes.paddedFromHex(s, bigEndian)
-
-  # 2. Convert canonical uint to BigNum
-  const wordBitwidth = sizeof(T) * 8
-  a.limbs.unmarshal(bytes, wordBitwidth, bigEndian)
-
-func fromHex[T](BN: type BigNum[T], bits: uint32, s: string): BN =
-  const wordBitwidth = sizeof(T) * 8
-  let numWords = wordsRequired(bits, wordBitwidth)
-
-  result.bits = bits
-  result.limbs.setLen(numWords)
-  result.fromHex(s)
+type
+  FieldDescriptor* = object
+    name*: string
+    modulus*: string # Modulus as Big-Endian uppercase hex, NOT prefixed with 0x
+    # primeKind*: PrimeKind
+
+    # Word: i32, i64 but can also be v4i32, v16i32 ...
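+    # (Worked example, not derived here: BLS12-381 with w = 64 gives
+    #  numWords = 6, intBufTy = i384 and spareBits = 3.)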
+ wordTy*: TypeRef + word2xTy*: TypeRef # Double the word size + v*, w*: uint32 + numWords*: uint32 + zero*, zero_i1*: ValueRef + intBufTy*: TypeRef # int type, multiple of the word size, that can store the field elements + # For example a 381 bit field is stored in 384-bit ints (whether on 32 or 64-bit platforms) + + # Field metadata + fieldTy*: TypeRef + bits*: uint32 + spareBits*: uint8 -func toHex[T](a: BigNum[T]): string = - ## Conversion to big-endian hex - ## This is variable-time - # 1. Convert BigInt to canonical uint - const wordBitwidth = sizeof(T) * 8 - var bytes = newSeq[byte](byteLen(a.bits)) - bytes.marshal(a.limbs, wordBitwidth, bigEndian) +proc configureField*(ctx: ContextRef, + name: string, + modBits: int, modulus: string, + v, w: int): FieldDescriptor = + ## Configure a field descriptor with: + ## - v: vector length + ## - w: base word size in bits + ## - a `modulus` of bitsize `modBits` + ## + ## - Name is a prefix for example + ## `mycurve_fp_` - # 2 Convert canonical uint to hex - return bytes.toHex() + let v = uint32 v + let w = uint32 w + let modBits = uint32 modBits -# Checks -# ------------------------------------------------ + result.name = name + result.modulus = modulus -func checkValidModulus(M: BigNum) = - const wordBitwidth = uint32(BigNum.T.sizeof() * 8) - let expectedMsb = M.bits-1 - wordBitwidth * (M.limbs.len.uint32 - 1) - let msb = log2_vartime(M.limbs[M.limbs.len-1]) + doAssert v == 1, "At the moment SIMD vectorization is not supported." + result.v = v + result.w = w - doAssert msb == expectedMsb, "Internal Error: the modulus must use all declared bits and only those:\n" & - " Modulus '" & M.toHex() & "' is declared with " & $M.bits & - " bits but uses " & $(msb + wordBitwidth * uint32(M.limbs.len - 1)) & " bits." + result.numWords = wordsRequired(modBits, w) + result.wordTy = ctx.int_t(w) + result.word2xTy = ctx.int_t(w+w) + result.zero = constInt(result.wordTy, 0) + result.zero_i1 = constInt(ctx.int1_t(), 0) -# Fields metadata -# ------------------------------------------------ + let next_multiple_wordsize = result.numWords * w + result.intBufTy = ctx.int_t(next_multiple_wordsize) -func negInvModWord[T](M: BigNum[T]): T = - ## Returns the Montgomery domain magic constant for the input modulus: - ## - ## µ ≡ -1/M[0] (mod SecretWord) - ## - ## M[0] is the least significant limb of M - ## M must be odd and greater than 2. 
- ## - ## Assuming 64-bit words: - ## - ## µ ≡ -1/M[0] (mod 2^64) - checkValidModulus(M) - return M.limbs[0].negInvModWord() + result.fieldTy = array_t(result.wordTy, result.numWords) + result.bits = modBits + result.spareBits = uint8(next_multiple_wordsize - modBits) -# ############################################################ -# -# Intermediate Representation -# -# ############################################################ +proc definePrimitives*(asy: Assembler_LLVM, fd: FieldDescriptor) = + asy.ctx.def_llvm_add_overflow(asy.module, fd.wordTy) + asy.ctx.def_llvm_add_overflow(asy.module, fd.intBufTy) + asy.ctx.def_llvm_sub_overflow(asy.module, fd.wordTy) + asy.ctx.def_llvm_sub_overflow(asy.module, fd.intBufTy) -type - WordSize* = enum - size32 - size64 - - Field* = enum - fp - fr - - FieldConst* = object - wordTy: TypeRef - fieldTy: TypeRef - modulus*: seq[ConstValueRef] - m0ninv*: ConstValueRef - bits*: uint32 - spareBits*: uint8 + asy.ctx.def_addcarry(asy.module, asy.ctx.int1_t(), fd.wordTy) + asy.ctx.def_subborrow(asy.module, asy.ctx.int1_t(), fd.wordTy) - CurveMetadata* = object - curve*: Algebra - prefix*: string - wordSize*: WordSize - fp*: FieldConst - fr*: FieldConst - - Opcode* = enum - opFpAdd = "fp_add" - opFrAdd = "fr_add" - opFpSub = "fp_sub" - opFrSub = "fr_sub" - opFpMul = "fp_mul" - opFrMul = "fr_mul" - opFpMulSkipFinalSub = "fp_mul_skip_final_sub" - opFrMulSkipFinalSub = "fr_mul_skip_final_sub" - -proc setFieldConst(fc: var FieldConst, ctx: ContextRef, wordSize: WordSize, modBits: uint32, modulus: string) = - let wordTy = case wordSize - of size32: ctx.int32_t() - of size64: ctx.int64_t() - - let wordBitwidth = case wordSize - of size32: 32'u32 - of size64: 64'u32 - - let numWords = wordsRequired(modBits, wordBitwidth) - - fc.wordTy = wordTy - fc.fieldTy = array_t(wordTy, numWords) - - case wordSize - of size32: - let m = BigNum[uint32].fromHex(modBits, modulus) - fc.modulus.setlen(m.limbs.len) - for i in 0 ..< m.limbs.len: - fc.modulus[i] = ctx.int32_t().constInt(m.limbs[i]) - - fc.m0ninv = ctx.int32_t().constInt(m.negInvModWord()) - - of size64: - let m = BigNum[uint64].fromHex(modBits, modulus) - fc.modulus.setlen(m.limbs.len) - for i in 0 ..< m.limbs.len: - fc.modulus[i] = ctx.int64_t().constInt(m.limbs[i]) - - fc.m0ninv = ctx.int64_t().constInt(m.negInvModWord()) - - debug: doAssert numWords == fc.modulus.len.uint32 - fc.bits = modBits - fc.spareBits = uint8(numWords*wordBitwidth - modBits) - -proc init*( - C: type CurveMetadata, ctx: ContextRef, - prefix: string, wordSize: WordSize, - fpBits: uint32, fpMod: string, - frBits: uint32, frMod: string): CurveMetadata = - - result = C(prefix: prefix, wordSize: wordSize) - result.fp.setFieldConst(ctx, wordSize, fpBits, fpMod) - result.fr.setFieldConst(ctx, wordSize, frBits, frMod) - -proc genSymbol*(cm: CurveMetadata, opcode: Opcode): string {.inline.} = - cm.prefix & - (if cm.wordSize == size32: "32b_" else: "64b_") & - $opcode - -func getFieldType*(cm: CurveMetadata, field: Field): TypeRef {.inline.} = - if field == fp: - return cm.fp.fieldTy - else: - return cm.fr.fieldTy - -func getNumWords*(cm: CurveMetadata, field: Field): int {.inline.} = - case field - of fp: - return cm.fp.modulus.len - of fr: - return cm.fr.modulus.len - -func getModulus*(cm: CurveMetadata, field: Field): lent seq[ConstValueRef] {.inline.} = - case field - of fp: - return cm.fp.modulus - of fr: - return cm.fr.modulus - -func getMontgomeryNegInverse0*(cm: CurveMetadata, field: Field): lent ConstValueRef {.inline.} = - case field - of fp: - 
return cm.fp.m0ninv
-  of fr:
-    return cm.fr.m0ninv
-
-func getSpareBits*(cm: CurveMetadata, field: Field): uint8 {.inline.} =
-  if field == fp:
-    return cm.fp.sparebits
-  else:
-    return cm.fr.sparebits
+proc wordTy*(fd: FieldDescriptor, value: SomeInteger): ValueRef =
+  constInt(fd.wordTy, value)

 # ############################################################
 #
-#                    Syntax Sugar
+#                    Aggregate Types
 #
 # ############################################################

@@ -309,34 +224,34 @@ func getSpareBits*(cm: CurveMetadata, field: Field): uint8 {.inline.} =

 type
   Array* = object
     builder: BuilderRef
-    p: ValueRef
+    buf*: ValueRef
     arrayTy: TypeRef
     elemTy: TypeRef
     int32_t: TypeRef

-proc asArray*(builder: BuilderRef, arrayPtr: ValueRef, arrayTy: TypeRef): Array =
+proc asArray*(asy: Assembler_LLVM, arrayPtr: ValueRef, arrayTy: TypeRef): Array =
   Array(
-    builder: builder,
-    p: arrayPtr,
+    builder: asy.br,
+    buf: arrayPtr,
     arrayTy: arrayTy,
     elemTy: arrayTy.getElementType(),
     int32_t: arrayTy.getContext().int32_t()
   )

-proc makeArray*(builder: BuilderRef, arrayTy: TypeRef): Array =
+proc makeArray*(asy: Assembler_LLVM, arrayTy: TypeRef): Array =
   Array(
-    builder: builder,
-    p: builder.alloca(arrayTy),
+    builder: asy.br,
+    buf: asy.br.alloca(arrayTy),
     arrayTy: arrayTy,
     elemTy: arrayTy.getElementType(),
     int32_t: arrayTy.getContext().int32_t()
   )

-proc makeArray*(builder: BuilderRef, elemTy: TypeRef, len: uint32): Array =
+proc makeArray*(asy: Assembler_LLVM, elemTy: TypeRef, len: uint32): Array =
   let arrayTy = array_t(elemTy, len)
   Array(
-    builder: builder,
-    p: builder.alloca(arrayTy),
+    builder: asy.br,
+    buf: asy.br.alloca(arrayTy),
     arrayTy: arrayTy,
     elemTy: elemTy,
     int32_t: arrayTy.getContext().int32_t()
@@ -344,13 +259,276 @@ proc makeArray*(builder: BuilderRef, elemTy: TypeRef, len: uint32): Array =

 proc `[]`*(a: Array, index: SomeInteger): ValueRef {.inline.}=
   # First dereference the array pointer with 0, then access the `index`
-  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.p, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
+  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
   a.builder.load2(a.elemTy, pelem)

 proc `[]=`*(a: Array, index: SomeInteger, val: ValueRef) {.inline.}=
-  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.p, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
+  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
   a.builder.store(val, pelem)

-proc store*(builder: BuilderRef, dst: Array, src: Array) {.inline.}=
-  let v = builder.load2(src.arrayTy, src.p)
-  builder.store(v, dst.p)
+proc store*(asy: Assembler_LLVM, dst: Array, src: Array) {.inline.}=
+  let v = asy.br.load2(src.arrayTy, src.buf)
+  asy.br.store(v, dst.buf)
+
+proc store*(asy: Assembler_LLVM, dst: Array, src: ValueRef) {.inline.}=
+  ## Heterogeneous store of i256 into 4xuint64
+  doAssert asy.byteOrder == kLittleEndian
+  asy.br.store(src, dst.buf)
+
+# Conversion to native LLVM int
+# -------------------------------
+
+proc asLlvmIntPtr*(asy: Assembler_LLVM, a: Array): ValueRef =
+  doAssert asy.byteOrder == kLittleEndian, "Only little-endian machines are supported at the moment."
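+  # Reinterpret the pointer to the [N x iW] limb array as a pointer to a
+  # single iN*W integer of the same total bit width; this matches limb order
+  # only on little-endian targets, hence the assert above.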
+  let bits = asy.datalayout.getSizeInBits(a.arrayTy)
+  let pInt = pointer_t(asy.ctx.int_t(uint32 bits))
+  asy.br.bitcast(a.buf, pInt)
+
+proc asLlvmIntPtr*(asy: Assembler_LLVM, v: ValueRef, ty: TypeRef): ValueRef =
+  doAssert asy.byteOrder == kLittleEndian, "Only little-endian machines are supported at the moment."
+  let pInt = pointer_t(ty)
+  asy.br.bitcast(v, pInt)
+
+# ############################################################
+#
+#                         Globals
+#
+# ############################################################
+
+proc loadGlobal*(asy: Assembler_LLVM, name: string): ValueRef =
+  let g = asy.module.getGlobal(cstring name)
+  doAssert not g.isNil(), "The global '" & name & "' has not been declared in the module"
+  let ty = g.getTypeOf()
+  return asy.br.load2(ty, g, name = "g")
+
+proc defineGlobalConstant*(
+      asy: Assembler_LLVM,
+      name, section: string,
+      value: ValueRef,
+      ty: TypeRef, alignment = -1): ValueRef =
+  ## Declare a global constant
+  ## name: The name of the constant
+  ## section: globals are kept near each other in memory to improve locality
+  ##          and avoid page-faults
+  ## an alignment of -1 leaves it at default for the ISA.
+  ## Otherwise configure the alignment in bytes.
+  ##
+  ## Return a pointer to the global
+  let g = asy.module.addGlobal(ty, cstring name)
+  g.setGlobal(value)
+  if alignment > 0:
+    g.setAlignment(cuint alignment)
+  # We intentionally keep globals internal:
+  # - for performance, this may avoid a translation table;
+  #   they can also be inlined.
+  # - for forward-compatibility: to expose the modulus, for example,
+  #   a function can translate from a non-matching internal representation,
+  #   say if we want a different endianness of words on big-endian machines.
+  # g.setLinkage(linkInternal)
+  g.setImmutable()
+
+  # Group related globals in the same section
+  # This doesn't prevent globals from being optimized away
+  # if they are fully inlined or unused.
+  # This has the following benefits:
+  # - They might all be loaded in memory if they share a cacheline
+  # - If a section is unused, it can be garbage collected by the linker
+  g.setSection(cstring("ctt." & section & ".constants"))
+  return g
+
+# ############################################################
+#
+#                  ISA configuration
+#
+# ############################################################
+
+proc tagCudaKernel(asy: Assembler_LLVM, fn: ValueRef) =
+  ## Tag a function as a Cuda Kernel, i.e. callable from host
+
+  let returnTy = fn.getTypeOf().getReturnType()
+  doAssert returnTy.isVoid(), block:
+    "Kernels must not return values but function returns " & $returnTy.getTypeKind()
+
+  asy.module.addNamedMetadataOperand(
+    "nvvm.annotations",
+    asy.ctx.asValueRef(asy.ctx.metadataNode([
+      fn.asMetadataRef(),
+      asy.ctx.metadataNode("kernel"),
+      asy.i32(1).asMetadataRef()
+    ]))
+  )
+
+proc setPublic(asy: Assembler_LLVM, fn: ValueRef) =
+  case asy.backend
+  of bkAmdGpu: fn.setFnCallConv(AMDGPU_KERNEL)
+  of bkNvidiaPtx: asy.tagCudaKernel(fn)
+  else: discard
+
+# ############################################################
+#
+#      Function Definition and calling convention
+#
+# ############################################################
+
+# Most architectures can pass up to 4 or 6 arguments directly into registers,
+# and we allow LLVM to use the best calling convention possible with "Fast".
+#
+# Recommendation:
+#   https://llvm.org/docs/Frontend/PerformanceTips.html
+#
+# Avoid creating values of aggregate types (i.e. structs and arrays).
+# In particular, avoid loading and storing them,
+# or manipulating them with insertvalue and extractvalue instructions.
+# Instead, only load and store individual fields of the aggregate.
+#
+# There are some exceptions to this rule:
+# - It is fine to use values of aggregate type in global variable initializers.
+# - It is fine to return structs, if this is done to represent the return of multiple values in registers.
+# - It is fine to work with structs returned by LLVM intrinsics, such as the with.overflow family of intrinsics.
+# - It is fine to use aggregate types without creating values. For example, they are commonly used in getelementptr instructions or attributes like sret.
+#
+# Furthermore for aggregate types like struct we need to check the number of elements
+# - https://groups.google.com/g/llvm-dev/c/CafdpEzOEp0
+# - https://stackoverflow.com/questions/27386912/prevent-clang-from-expanding-arguments-that-are-aggregate-types
+# - https://people.freebsd.org/~obrien/amd64-elf-abi.pdf
+# Though that might be overkill for functions tagged 'internal' linkage and 'Fast' CC
+#
+# Hopefully the compiler will remove the unnecessary load/store/register movement, especially when inlining.
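+#
+# A concrete instance of the rules above, for a 6-limb 64-bit field like
+# BLS12-381 (numbers illustrative, not computed here): an Fp element is
+# [6 x i64], i.e. 48 bytes > 3x the 8-byte pointer size, so
+# `wrapTypesForFnCall` below passes it behind a pointer, while a <4 x i64>
+# SIMD vector would be left as-is and travel in vector registers.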
+
+proc wrapTypesForFnCall[N: static int](
+        asy: Assembler_LLVM,
+        paramTypes: array[N, TypeRef]
+      ): tuple[wrapped, src: array[N, TypeRef]] =
+  ## Wrap parameters that would need more than 3x registers
+  ## into a pointer.
+  ## There are 2 such cases:
+  ## - An array/struct of more than 3 elements, for example 4x uint32
+  ## - A type larger than 3x the pointer size, for example 4x uint64
+  ## Vectors are passed by special SIMD registers
+  ##
+  ## Due to LLVM opaque pointers, we return the wrapped and src types
+
+  for i in 0 ..< paramTypes.len:
+    let ty = paramTypes[i]
+    let tk = ty.getTypeKind()
+    if tk in {tkVector, tkScalableVector}:
+      ## There are special SIMD registers for vectors
+      result.wrapped[i] = paramTypes[i]
+      result.src[i]     = paramTypes[i]
+    elif asy.datalayout.getAbiSize(ty).int32 > 3*asy.psize:
+      result.wrapped[i] = pointer_t(paramTypes[i])
+      result.src[i]     = paramTypes[i]
+    else:
+      case tk
+      of tkArray:
+        if ty.getArrayLength() >= 3:
+          result.wrapped[i] = pointer_t(paramTypes[i])
+          result.src[i]     = paramTypes[i]
+        else:
+          result.wrapped[i] = paramTypes[i]
+          result.src[i]     = paramTypes[i]
+      of tkStruct:
+        if ty.getNumElements() >= 3:
+          result.wrapped[i] = pointer_t(paramTypes[i])
+          result.src[i]     = paramTypes[i]
+        else:
+          result.wrapped[i] = paramTypes[i]
+          result.src[i]     = paramTypes[i]
+      else:
+        result.wrapped[i] = paramTypes[i]
+        result.src[i]     = paramTypes[i]
+
+proc addAttributes(asy: Assembler_LLVM, fn: ValueRef, attrs: set[AttrKind]) =
+  for attr in attrs:
+    fn.addAttribute(kAttrFnIndex, asy.attrs[attr])
+
+  fn.addAttribute(kAttrFnIndex, asy.attrs[kHot])
+
+template llvmFnDef[N: static int](
+          asy: Assembler_LLVM,
+          name, sectionName: string,
+          returnType: TypeRef,
+          paramTypes: array[N, TypeRef],
+          internal: bool,
+          attrs: set[AttrKind],
+          body: untyped) =
+  ## This sets up the common prologue to implement a function in LLVM.
+  ## Function parameters are available with the `llvmParams` magic variable
+  let paramsTys = asy.wrapTypesForFnCall(paramTypes)
+
+  var fn = asy.module.getFunction(cstring name)
+  if fn.pointer.isNil():
+    var savedLoc = asy.br.getInsertBlock()
+
+    let fnTy = function_t(returnType, paramsTys.wrapped)
+    fn = asy.module.addFunction(cstring name, fnTy)
+
+    asy.fns[name] = (fnTy, fn, internal)
+
+    let blck = asy.ctx.appendBasicBlock(fn)
+    asy.br.positionAtEnd(blck)
+
+    if savedLoc.pointer.isNil():
+      # We're installing the first function
+      # of the call tree, return to its basic block
+      savedLoc = blck
+
+    let llvmParams {.inject.} = unpackParams(asy.br, paramsTys)
+    template tagParameter(idx: int, attr: string) {.inject.} =
+      let a = asy.ctx.createAttr(attr)
+      fn.addAttribute(cint idx, a)
+    body
+
+    if internal:
+      fn.setFnCallConv(Fast)
+      fn.setLinkage(linkInternal)
+    else:
+      asy.setPublic(fn)
+    fn.setSection(sectionName)
+    asy.addAttributes(fn, attrs)
+
+    asy.br.positionAtEnd(savedLoc)
+
+template llvmInternalFnDef*[N: static int](
+          asy: Assembler_LLVM,
+          name, sectionName: string,
+          returnType: TypeRef,
+          paramTypes: array[N, TypeRef],
+          attrs: set[AttrKind] = {},
+          body: untyped) =
+  llvmFnDef(asy, name, sectionName, returnType, paramTypes, internal = true, attrs, body)
+
+template llvmPublicFnDef*[N: static int](
+          asy: Assembler_LLVM,
+          name, sectionName: string,
+          returnType: TypeRef,
+          paramTypes: array[N, TypeRef],
+          body: untyped) =
+  llvmFnDef(asy, name, sectionName, returnType, paramTypes, internal = false, {}, body)
+
+proc callFn*(
+      asy: Assembler_LLVM,
+      name: string,
+      params: openArray[ValueRef]): ValueRef {.discardable.} =
+
+  if asy.fns[name].ty.getReturnType().getTypeKind() == tkVoid:
+    result = asy.br.call2(asy.fns[name].ty, asy.fns[name].impl, params)
+  else:
+    result = asy.br.call2(asy.fns[name].ty, asy.fns[name].impl, params, cstring(name))
+
+  if asy.fns[name].internal:
+    result.setInstrCallConv(Fast)
+
+# ############################################################
+#
+#                  Forward to Builder
+#
+# ############################################################
+
+# {.experimental: "dotOperators".} does not seem to work within templates/macros
+
+template load2*(asy: Assembler_LLVM, ty: TypeRef, `ptr`: ValueRef, name: cstring = ""): ValueRef =
+  asy.br.load2(ty, `ptr`, name)
+
+template store*(asy: Assembler_LLVM, dst, src: ValueRef, name: cstring = "") =
+  asy.br.store(src, dst)
diff --git a/constantine/math_compiler/pub_fields.nim b/constantine/math_compiler/pub_fields.nim
new file mode 100644
index 00000000..51f92da3
--- /dev/null
+++ b/constantine/math_compiler/pub_fields.nim
@@ -0,0 +1,30 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  constantine/platforms/llvm/llvm,
+  ./ir,
+  ./impl_fields_globals,
+  ./impl_fields_sat
+
+proc genFpAdd*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
+  ## Generate a public field addition proc
+  ## with signature
+  ##   void name(FieldType r, FieldType a, FieldType b)
+  ## with r the result and a, b the operands,
+  ## and return the corresponding name to call it.
+
+  let name = fd.name & "_add"
+  asy.llvmPublicFnDef(name, "ctt."
& fd.name, asy.void_t, [fd.fieldTy, fd.fieldTy, fd.fieldTy]): + let M = asy.getModulusPtr(fd) + + let (r, a, b) = llvmParams + asy.modadd(fd, r, a, b, M) + asy.br.retVoid() + + return name diff --git a/constantine/platforms/abis/llvm_abi.nim b/constantine/platforms/abis/llvm_abi.nim index 7fcf342f..81b19b20 100644 --- a/constantine/platforms/abis/llvm_abi.nim +++ b/constantine/platforms/abis/llvm_abi.nim @@ -38,12 +38,14 @@ type ContextRef* = distinct pointer ModuleRef* = distinct pointer TargetRef* = distinct pointer + TargetDataRef* = distinct pointer ExecutionEngineRef* = distinct pointer TargetMachineRef* = distinct pointer PassBuilderOptionsRef* = distinct pointer TypeRef* = distinct pointer ValueRef* = distinct pointer MetadataRef = distinct pointer + AttributeRef* = distinct pointer LLVMstring = distinct cstring ErrorMessageString = distinct cstring ## A string with a buffer owned by LLVM @@ -122,16 +124,19 @@ proc verify(module: ModuleRef, failureAction: VerifierFailureAction, msg: var LL {.push used.} proc initializeX86AsmPrinter() {.importc: "LLVMInitializeX86AsmPrinter".} +proc initializeX86AsmParser() {.importc: "LLVMInitializeX86AsmParser".} proc initializeX86Target() {.importc: "LLVMInitializeX86Target".} proc initializeX86TargetInfo() {.importc: "LLVMInitializeX86TargetInfo".} proc initializeX86TargetMC() {.importc: "LLVMInitializeX86TargetMC".} proc initializeNVPTXAsmPrinter() {.importc: "LLVMInitializeNVPTXAsmPrinter".} +proc initializeNVPTXAsmParser() {.importc: "LLVMInitializeNVPTXAsmParser".} proc initializeNVPTXTarget() {.importc: "LLVMInitializeNVPTXTarget".} proc initializeNVPTXTargetInfo() {.importc: "LLVMInitializeNVPTXTargetInfo".} proc initializeNVPTXTargetMC() {.importc: "LLVMInitializeNVPTXTargetMC".} proc initializeAMDGPUAsmPrinter() {.importc: "LLVMInitializeAMDGPUAsmPrinter".} +proc initializeAMDGPUAsmParser() {.importc: "LLVMInitializeAMDGPUAsmParser".} proc initializeAMDGPUTarget() {.importc: "LLVMInitializeAMDGPUTarget".} proc initializeAMDGPUTargetInfo() {.importc: "LLVMInitializeAMDGPUTargetInfo".} proc initializeAMDGPUTargetMC() {.importc: "LLVMInitializeAMDGPUTargetMC".} @@ -186,19 +191,33 @@ type CodeGenFileType* {.size: sizeof(cint).} = enum AssemblyFile, ObjectFile - TargetDataRef* = distinct pointer - TargetLibraryInfoRef* = distinct pointer - # "" proc createTargetMachine*( target: TargetRef, triple, cpu, features: cstring, level: CodeGenOptLevel, reloc: RelocMode, codeModel: CodeModel): TargetMachineRef {.importc: "LLVMCreateTargetMachine".} proc dispose*(m: TargetMachineRef) {.importc: "LLVMDisposeTargetMachine".} -proc createTargetDataLayout*(t: TargetMachineRef): TargetDataRef {.importc: "LLVMCreateTargetDataLayout".} +proc getDataLayout*(t: TargetMachineRef): TargetDataRef {.importc: "LLVMCreateTargetDataLayout".} +proc getDataLayout*(module: ModuleRef): TargetDataRef {.importc: "LLVMGetModuleDataLayout".} proc dispose*(m: TargetDataRef) {.importc: "LLVMDisposeTargetData".} proc setDataLayout*(module: ModuleRef, dataLayout: TargetDataRef) {.importc: "LLVMSetModuleDataLayout".} +proc getPointerSize*(datalayout: TargetDataRef): cuint {.importc: "LLVMPointerSize".} +proc getSizeInBits*(datalayout: TargetDataRef, ty: TypeRef): culonglong {.importc: "LLVMSizeOfTypeInBits".} + ## Computes the size of a type in bits for a target. +proc getStoreSize*(datalayout: TargetDataRef, ty: TypeRef): culonglong {.importc: "LLVMStoreSizeOfType".} + ## Computes the storage size of a type in bytes for a target. 
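+  ## (e.g. on x86-64, x86_fp80 is 80 bits in size, has a 10-byte store size,
+  ##  and a 16-byte ABI size once alignment padding is included)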
+proc getAbiSize*(datalayout: TargetDataRef, ty: TypeRef): culonglong {.importc: "LLVMABISizeOfType".}
+  ## Computes the ABI size of a type in bytes for a target.
+
+type
+  ByteOrder {.size: sizeof(cint).} = enum
+    kBigEndian
+    kLittleEndian
+
+proc getEndianness*(datalayout: TargetDataRef): ByteOrder {.importc: "LLVMByteOrder".}
+
+
 proc targetMachineEmitToFile*(t: TargetMachineRef, m: ModuleRef, fileName: cstring, codegen: CodeGenFileType, errorMessage: var LLVMstring): LLVMBool {.importc: "LLVMTargetMachineEmitToFile".}
 proc targetMachineEmitToMemoryBuffer*(t: TargetMachineRef, m: ModuleRef,
@@ -280,12 +299,17 @@ proc getIntTypeWidth*(ty: TypeRef): uint32 {.importc: "LLVMGetIntTypeWidth".}

 proc struct_t*(
        ctx: ContextRef,
        elemTypes: openArray[TypeRef],
-       packed: LlvmBool): TypeRef {.wrapOpenArrayLenType: cuint, importc: "LLVMStructTypeInContext".}
+       packed = LlvmBool(false)): TypeRef {.wrapOpenArrayLenType: cuint, importc: "LLVMStructTypeInContext".}
 proc array_t*(elemType: TypeRef, elemCount: uint32): TypeRef {.importc: "LLVMArrayType".}
+proc vector_t*(elemType: TypeRef, elemCount: uint32): TypeRef {.importc: "LLVMVectorType".}
+  ## Create a SIMD vector type (for SSE, AVX or Neon for example)

 proc pointerType(elementType: TypeRef; addressSpace: cuint): TypeRef {.used, importc: "LLVMPointerType".}
 proc getElementType*(arrayOrVectorTy: TypeRef): TypeRef {.importc: "LLVMGetElementType".}
+proc getArrayLength*(arrayTy: TypeRef): uint64 {.importc: "LLVMGetArrayLength2".}
+proc getNumElements*(structTy: TypeRef): cuint {.importc: "LLVMCountStructElementTypes".}
+proc getVectorSize*(vecTy: TypeRef): cuint {.importc: "LLVMGetVectorSize".}

 # Functions
 # ------------------------------------------------------------
@@ -537,6 +561,44 @@ type
     # The highest possible ID. Must be some 2^k - 1.
     MaxID = 1023

+type
+  Linkage {.size: sizeof(cint).} = enum
+    # https://web.archive.org/web/20240224034505/https://bluesadi.me/2024/01/05/Linkage-types-in-LLVM/
+    # Weak linkage means unreferenced globals may not be discarded when linking.
+    #
+    # Also relevant: https://stackoverflow.com/a/55599037
+    #   The necessity of making code relocatable in order to allow shared objects to be loaded at different addresses
+    #   in different processes means that statically allocated variables,
+    #   whether they have global or local scope,
+    #   can't be accessed directly with a single instruction on most architectures.
+    #   The only exception I know of is the 64-bit x86 architecture, as you see above.
+    #   It supports memory operands that are both PC-relative and have large 32-bit displacements
+    #   that can reach any variable defined in the same component.
+    linkExternal,            ## Externally visible function
+    linkAvailableExternally, ## no description
+    linkOnceAny,             ## Keep one copy of function when linking (inline)
+    linkOnceODR,             ## Same, but only replaced by something equivalent. (ODR: one definition rule)
+    linkOnceODRAutoHide,     ## Obsolete
+    linkWeakAny,             ## Keep one copy of function when linking (weak)
+    linkWeakODR,             ## Same, but only replaced by something equivalent.
+    linkAppending,           ## Special purpose, only applies to global arrays
+    linkInternal,            ## Rename collisions when linking (static functions)
+    linkPrivate,             ## Like Internal, but omit from symbol table
+    linkDLLImport,           ## Obsolete
+    linkDLLExport,           ## Obsolete
+    linkExternalWeak,        ## ExternalWeak linkage description
+    linkGhost,               ## Obsolete
+    linkCommon,              ## Tentative definitions
+    linkLinkerPrivate,       ## Like Private, but linker removes.
+ linkLinkerPrivateWeak ## Like LinkerPrivate, but is weak. + + Visibility {.size: sizeof(cint).} = enum + # Note: Function with internal or private linkage must have default visibility + visDefault + visHidden + visProtected + + proc function_t*( returnType: TypeRef, paramTypes: openArray[TypeRef], @@ -546,11 +608,33 @@ proc addFunction*(m: ModuleRef, name: cstring, ty: TypeRef): ValueRef {.importc: ## Declare a function `name` in a module. ## Returns a handle to specify its instructions +proc getFunction*(m: ModuleRef, name: cstring): ValueRef {.importc: "LLVMGetNamedFunction".} + ## Get a function by name from the current module. + ## Returns nil if not found. + proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".} proc countParamTypes*(functionTy: TypeRef): uint32 {.importc: "LLVMCountParamTypes".} -proc getCallingConvention*(function: ValueRef): CallingConvention {.importc: "LLVMGetFunctionCallConv".} -proc setCallingConvention*(function: ValueRef, cc: CallingConvention) {.importc: "LLVMSetFunctionCallConv".} +proc getCalledFunctionType*(fn: ValueRef): TypeRef {.importc: "LLVMGetCalledFunctionType".} + +proc getFnCallConv*(function: ValueRef): CallingConvention {.importc: "LLVMGetFunctionCallConv".} +proc setFnCallConv*(function: ValueRef, cc: CallingConvention) {.importc: "LLVMSetFunctionCallConv".} + +proc getInstrCallConv*(instr: ValueRef): CallingConvention {.importc: "LLVMGetInstructionCallConv".} +proc setInstrCallConv*(instr: ValueRef, cc: CallingConvention) {.importc: "LLVMSetInstructionCallConv".} + +type + AttributeIndex* {.size: sizeof(cint).} = enum + ## Attribute index is either -1 for the function, + ## 0 for the return value, + ## or 1..n for each function parameter + kAttrFnIndex = -1 + kAttrRetIndex = 0 + +proc toAttrId*(name: openArray[char]): cuint {.importc: "LLVMGetEnumAttributeKindForName".} +proc toAttr*(ctx: ContextRef, attr_id: uint64, val = 0'u64): AttributeRef {.importc: "LLVMCreateEnumAttribute".} +proc addAttribute*(fn: ValueRef, index: cint, attr: AttributeRef) {.importc: "LLVMAddAttributeAtIndex".} +proc addAttribute*(fn: ValueRef, index: AttributeIndex, attr: AttributeRef) {.importc: "LLVMAddAttributeAtIndex".} # ############################################################ # @@ -560,6 +644,18 @@ proc setCallingConvention*(function: ValueRef, cc: CallingConvention) {.importc: # {.push header: "".} +proc getGlobal*(module: ModuleRef, name: cstring): ValueRef {.importc: "LLVMGetNamedGlobal".} +proc addGlobal*(module: ModuleRef, ty: TypeRef, name: cstring): ValueRef {.importc: "LLVMAddGlobal".} +proc setGlobal*(globalVar: ValueRef, constantVal: ValueRef) {.importc: "LLVMSetInitializer".} +proc setImmutable*(globalVar: ValueRef, immutable = LlvmBool(true)) {.importc: "LLVMSetGlobalConstant".} + +proc getGlobalParent*(global: ValueRef): ModuleRef {.importc: "LLVMGetGlobalParent".} + +proc setLinkage*(global: ValueRef, linkage: Linkage) {.importc: "LLVMSetLinkage".} +proc setVisibility*(global: ValueRef, vis: Visibility) {.importc: "LLVMSetVisibility".} +proc setAlignment*(v: ValueRef, bytes: cuint) {.importc: "LLVMSetAlignment".} +proc setSection*(global: ValueRef, section: cstring) {.importc: "LLVMSetSection".} proc getTypeOf*(v: ValueRef): TypeRef {.importc: "LLVMTypeOf".} proc getValueName2(v: ValueRef, rLen: var csize_t): cstring {.used, importc: "LLVMGetValueName2".} ## Returns the name of a value if it exists.
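These calling-convention, attribute and linkage bindings are the building blocks the super-instruction machinery later in this diff relies on. A minimal sketch of how they combine, assuming a module `m`, a context `ctx` and a function type `fnTy` already built with `function_t` (`createAttr` is the convenience wrapper added to llvm.nim below; the function name is purely illustrative):

```nim
# Sketch only: declare an internal helper and tag it, mirroring what
# defSuperInstruction in super_instructions.nim does further down.
let fn = m.addFunction("_mod_add_u64x4", fnTy)
fn.setFnCallConv(Fast)       # internal "fast" calling convention
fn.setLinkage(linkInternal)  # renamed on collision, not exported
fn.addAttribute(kAttrFnIndex, ctx.createAttr("alwaysinline"))
fn.addAttribute(kAttrFnIndex, ctx.createAttr("hot"))
```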
@@ -578,6 +674,9 @@ proc toLLVMstring(v: ValueRef): LLVMstring {.used, importc: "LLVMPrintValueToStr # https://llvm.org/doxygen/group__LLVMCCoreValueConstant.html proc constInt(ty: TypeRef, n: culonglong, signExtend: LlvmBool): ValueRef {.used, importc: "LLVMConstInt".} +proc constIntOfArbitraryPrecision(ty: TypeRef, numWords: cuint, words: ptr uint64): ValueRef {.used, importc: "LLVMConstIntOfArbitraryPrecision".} +proc constIntOfStringAndSize(ty: TypeRef, text: openArray[char], radix: uint8): ValueRef {.used, importc: "LLVMConstIntOfStringAndSize".} + proc constReal*(ty: TypeRef, n: cdouble): ValueRef {.importc: "LLVMConstReal".} proc constNull*(ty: TypeRef): ValueRef {.importc: "LLVMConstNull".} @@ -587,6 +686,15 @@ proc constArray*( constantVals: openArray[ValueRef], ): ValueRef {.wrapOpenArrayLenType: cuint, importc: "LLVMConstArray".} +# Undef & Poison +# ------------------------------------------------------------ +# https://llvm.org/devmtg/2020-09/slides/Lee-UndefPoison.pdf + +proc poison*(ty: TypeRef): ValueRef {.importc: "LLVMGetPoison".} +proc undef*(ty: TypeRef): ValueRef {.importc: "LLVMGetUndef".} + + + # ############################################################ # # IR builder # @@ -601,17 +709,17 @@ type ## An instruction builder represents a point within a basic block and is ## the exclusive means of building instructions using the C interface. - IntPredicate* {.size: sizeof(cint).} = enum - IntEQ = 32 ## equal - IntNE ## not equal - IntUGT ## unsigned greater than - IntUGE ## unsigned greater or equal - IntULT ## unsigned less than - IntULE ## unsigned less or equal - IntSGT ## signed greater than - IntSGE ## signed greater or equal - IntSLT ## signed less than - IntSLE ## signed less or equal + Predicate* {.size: sizeof(cint).} = enum + kEQ = 32 ## equal + kNE ## not equal + kUGT ## unsigned greater than + kUGE ## unsigned greater or equal + kULT ## unsigned less than + kULE ## unsigned less or equal + kSGT ## signed greater than + kSGE ## signed greater or equal + kSLT ## signed less than + kSLE ## signed less or equal InlineAsmDialect* {.size: sizeof(cint).} = enum InlineAsmDialectATT @@ -622,7 +730,7 @@ # Instantiation # ------------------------------------------------------------ -proc appendBasicBlock*(ctx: ContextRef, fn: ValueRef, name: cstring): BasicBlockRef {.importc: "LLVMAppendBasicBlockInContext".} +proc appendBasicBlock*(ctx: ContextRef, fn: ValueRef, name: cstring = ""): BasicBlockRef {.importc: "LLVMAppendBasicBlockInContext".} ## Append a basic block to the end of a function proc createBuilder*(ctx: ContextRef): BuilderRef {.importc: "LLVMCreateBuilderInContext".} @@ -675,19 +783,27 @@ proc call2*( proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAdd".} proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWAdd".} + ## Addition No Signed Wrap, i.e. guaranteed to not overflow proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWAdd".} + ## Addition No Unsigned Wrap, i.e. guaranteed to not overflow proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSub".} proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWSub".} + ## Subtraction No Signed Wrap, i.e.
guaranteed to not overflow proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWSub".} + ## Subtraction No Unsigned Wrap, i.e. guaranteed to not overflow proc neg*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNeg".} proc negNSW*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWNeg".} + ## Negation No Signed Wrap, i.e. guaranteed to not overflow proc negNUW*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWNeg".} + ## Negation No Unsigned Wrap, i.e. guaranteed to not overflow proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMul".} proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNSWMul".} + ## Multiplication No Signed Wrap, i.e. guaranteed to not overflow proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNUWMul".} + ## Multiplication No Unsigned Wrap, i.e. guaranteed to not overflow proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildUDiv".} proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildExactUDiv".} @@ -706,9 +822,9 @@ proc `xor`*(builder: BuilderRef, lhs, rhs: ValueR proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildNot".} proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildSelect".} -proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".} +proc icmp*(builder: BuilderRef, op: Predicate, lhs, rhs: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildICmp".} -proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildBitcast".} +proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildBitCast".} proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildTrunc".} proc zext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildZExt".} ## Zero-extend @@ -722,7 +838,7 @@ proc alloca*(builder: BuilderRef, ty: TypeRef, name: cstring = ""): ValueRef {.i proc allocaArray*(builder: BuilderRef, ty: TypeRef, length: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildArrayAlloca".} proc extractValue*(builder: BuilderRef, aggVal: ValueRef, index: uint32, name: cstring = ""): ValueRef {.importc: "LLVMBuildExtractValue".} -proc insertValue*(builder: BuilderRef, aggVal: ValueRef, eltVal: ValueRef, index: uint32, name: cstring = ""): ValueRef {.discardable, importc: "LLVMBuildInsertValue".} +proc insertValue*(builder: BuilderRef, aggVal: ValueRef, eltVal: ValueRef, index: uint32, name: cstring = ""): ValueRef {.importc: "LLVMBuildInsertValue".} proc getElementPtr2*( builder: BuilderRef, diff --git a/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim b/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim index 345acd8f..f0e438a2 100644 --- a/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim +++
b/constantine/platforms/intrinsics/extended_precision_64bit_uint128.nim @@ -85,7 +85,7 @@ func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= {.emit:["*",lo, " = (NU64)", dblPrec,";"].} func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = - ## Extended precision multiplication + ## Signed extended precision multiplication ## (hi, lo) <- a*b ## ## Inputs are intentionally unsigned @@ -103,4 +103,4 @@ func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = {.emit:[lo, " = (NU64)", dblPrec,";"].} else: {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} - {.emit:["*",lo, " = (NU64)", dblPrec,";"].} \ No newline at end of file + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} diff --git a/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim b/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim index 3216b859..06bde04d 100644 --- a/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim +++ b/constantine/platforms/intrinsics/extended_precision_x86_64_msvc.nim @@ -77,12 +77,12 @@ func smul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_mul12 ## as we use their unchecked raw representation for cryptography func smul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = - ## Extended precision multiplication + ## Signed extended precision multiplication ## (hi, lo) <- a*b ## ## Inputs are intentionally unsigned ## as we use their unchecked raw representation for cryptography - ## + ## ## This is constant-time on most hardware ## See: https://www.bearssl.org/ctmul.html - lo = smul128(a, b, hi) \ No newline at end of file + lo = smul128(a, b, hi) diff --git a/constantine/platforms/llvm/nvidia_inlineasm.nim b/constantine/platforms/llvm/asm_nvidia.nim similarity index 100% rename from constantine/platforms/llvm/nvidia_inlineasm.nim rename to constantine/platforms/llvm/asm_nvidia.nim diff --git a/research/codegen/x86_inlineasm.nim b/constantine/platforms/llvm/asm_x86.nim similarity index 100% rename from research/codegen/x86_inlineasm.nim rename to constantine/platforms/llvm/asm_x86.nim diff --git a/constantine/platforms/llvm/llvm.nim b/constantine/platforms/llvm/llvm.nim index d222a306..e87e7989 100644 --- a/constantine/platforms/llvm/llvm.nim +++ b/constantine/platforms/llvm/llvm.nim @@ -7,6 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import constantine/platforms/abis/llvm_abi {.all.} +import std/macros export llvm_abi # ############################################################ @@ -146,11 +147,17 @@ proc emitTo*[T: string or seq[byte]](t: TargetMachineRef, m: ModuleRef, codegen: # Builder # ------------------------------------------------------------ +proc getCurrentFunction*(builder: BuilderRef): ValueRef = + builder.getInsertBlock().getBasicBlockParent() + proc getContext*(builder: BuilderRef): ContextRef = # LLVM C API does not expose IRBuilder.getContext() # making this unnecessarily painful # https://github.com/llvm/llvm-project/issues/59875 - builder.getInsertBlock().getBasicBlockParent().getTypeOf().getContext() + builder.getCurrentFunction().getTypeOf().getContext() + +proc getCurrentModule*(builder: BuilderRef): ModuleRef = + builder.getCurrentFunction().getGlobalParent() # Types # ------------------------------------------------------------ @@ -172,12 +179,37 @@ proc array_t*(elemType: TypeRef, elemCount: SomeInteger): TypeRef {.inline.}= proc function_t*(returnType: TypeRef, paramTypes: openArray[TypeRef]): TypeRef {.inline.} = function_t(returnType, paramTypes, isVarArg = LlvmBool(false)) +# Functions +# ------------------------------------------------------------ + +proc createAttr*(ctx: ContextRef, name: openArray[char]): AttributeRef = + ctx.toAttr(name.toAttrId()) + +proc toTypes*[N: static int](v: array[N, ValueRef]): array[N, TypeRef] = + for i in 0 ..< v.len: + result[i] = v[i].getTypeOf() + +macro unpackParams*[N: static int]( + br: BuilderRef, + paramsTys: tuple[wrapped, src: array[N, TypeRef]]): untyped = + ## Unpack function parameters. + ## + ## The new function basic block MUST be set up before calling unpackParams. + ## + ## In the future we may automatically unwrap types. + + result = nnkPar.newTree() + for i in 0 ..< N: + result.add quote do: + # let tySrc = `paramsTys`.src[`i`] + # let tyCC = `paramsTys`.wrapped[`i`] + let fn = `br`.getCurrentFunction() + fn.getParam(uint32 `i`) + # Values # ------------------------------------------------------------ -type - ConstValueRef* = distinct ValueRef - AnyValueRef* = ValueRef or ConstValueRef +proc isNil*(v: ValueRef): bool {.borrow.} proc getName*(v: ValueRef): string = var rLen: csize_t @@ -186,7 +218,5 @@ proc getName*(v: ValueRef): string = result = newString(rLen.int) copyMem(result[0].addr, rStr, rLen.int) -proc constInt*(ty: TypeRef, n: uint64, signExtend = false): ConstValueRef {.inline.} = - ConstValueRef constInt(ty, culonglong(n), LlvmBool(signExtend)) - -proc getTypeOf*(v: ConstValueRef): TypeRef {.borrow.} +proc constInt*(ty: TypeRef, n: SomeInteger, signExtend = false): ValueRef {.inline.} = + constInt(ty, culonglong(n), LlvmBool(signExtend)) diff --git a/constantine/platforms/llvm/super_instructions.nim b/constantine/platforms/llvm/super_instructions.nim new file mode 100644 index 00000000..08646288 --- /dev/null +++ b/constantine/platforms/llvm/super_instructions.nim @@ -0,0 +1,392 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms.
+ +import ./llvm + +# ############################################################ +# +# LLVM IR super-instructions +# +# ############################################################ + +# This defines a collection of LLVM IR super-instructions +# Ideally those super-instructions compile down +# to ISA-optimized single instructions +# +# To ensure this, tests can be consulted at: +# https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/ + +# Add-carry: +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/add-of-carry.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/addcarry.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/addcarry2.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/adx-intrinsics.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/adx-intrinsics-upgrade.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/apx/adc.ll +# +# Sub-borrow: +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/sub-with-overflow.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/AArch64/cgp-usubo.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/cgp-usubo.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/apx/sbb.ll +# +# Multiplication: +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/mulx32.ll +# - https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/test/CodeGen/X86/mulx64.ll + +# Warning 1: +# +# There is no guarantee of constant-time execution with LLVM IR. +# It MAY introduce branches.
+# For workloads that involve private keys or secrets, +# assembly MUST be used +# +# Alternatively an assembly source file must be generated +# and checked into the repo to avoid regressions should +# the compiler "progress" +# +# - https://github.com/mratsim/constantine/wiki/Constant-time-arithmetics#fighting-the-compiler +# - https://blog.cr.yp.to/20240803-clang.html +# - https://www.cl.cam.ac.uk/~rja14/Papers/whatyouc.pdf +# +# Warning 2: +# +# Unfortunately implementing unrolled bigint arithmetic using word size +# is fraught with perils for add-carry / sub-borrow +# AMDGPU crash: https://github.com/llvm/llvm-project/issues/102058 +# ARM64 missed optim: https://github.com/llvm/llvm-project/issues/102062 +# +# and while using @llvm.usub.with.overflow.i64 allows ARM64 to recover the missing optimization, +# it is still missed on AMDGPU (and Nvidia) + +proc hi(bld: BuilderRef, val: ValueRef, baseTy: TypeRef, oversize: uint32, prefix: string): ValueRef = + let ctx = bld.getContext() + let bits = baseTy.getIntTypeWidth() + let overTy = ctx.int_t(bits + oversize) + + # %hi_shift_1 = zext i8 64 to i128 + let s = constInt(ctx.int8_t(), oversize) + let shift = bld.zext(s, overTy, name = cstring(prefix & "S_")) + # %hiLarge_1 = lshr i128 %input, %hi_shift_1 + let hiLarge = bld.lshr(val, shift, name = cstring(prefix & "L_")) + # %hi_1 = trunc i128 %hiLarge_1 to i64 + let hi = bld.trunc(hiLarge, baseTy, name = cstring(prefix & "_")) + + return hi + +const SectionName = "ctt.superinstructions" + +proc getInstrName(baseName: string, ty: TypeRef, builtin = false): string = + var w, v: int # Wordsize and vector size + if ty.getTypeKind() == tkInteger: + w = int ty.getIntTypeWidth() + v = 1 + elif ty.getTypeKind() == tkVector: + v = int ty.getVectorSize() + w = int ty.getElementType().getIntTypeWidth() + else: + doAssert false, "Invalid input type: " & $ty + + return baseName & + (if v != 1: ".v" & $v else: ".") & + (if builtin: "i" else: "u") & $w + + +proc def_llvm_add_overflow*(ctx: ContextRef, m: ModuleRef, wordTy: TypeRef) = + let name = "llvm.uadd.with.overflow".getInstrName(wordTy, builtin = true) + + let br {.inject.} = ctx.createBuilder() + defer: br.dispose() + + var fn = m.getFunction(cstring name) + if fn.pointer.isNil(): + let retTy = ctx.struct_t([wordTy, ctx.int1_t()]) + let fnTy = function_t(retTy, [wordTy, wordTy]) + discard m.addFunction(cstring name, fnTy) + +proc llvm_add_overflow*(br: BuilderRef, a, b: ValueRef, name = ""): tuple[carryOut, r: ValueRef] = + ## (cOut, result) <- a+b + let ty = a.getTypeOf() + let intrin_name = "llvm.uadd.with.overflow".getInstrName(ty, builtin = true) + + let fn = br.getCurrentModule().getFunction(cstring intrin_name) + doAssert not fn.pointer.isNil, "Function '" & intrin_name & "' does not exist in the module\n" + + let ctx = br.getContext() + + let retTy = ctx.struct_t([ty, ctx.int1_t()]) + let fnTy = function_t(retTy, [ty, ty]) + let addo = br.call2(fnTy, fn, [a, b], cstring name) + let lo = br.extractValue(addo, 0, cstring(name & ".lo")) + let cOut = br.extractValue(addo, 1, cstring(name & ".carry")) + return (cOut, lo) + +proc def_llvm_sub_overflow*(ctx: ContextRef, m: ModuleRef, wordTy: TypeRef) = + let name = "llvm.usub.with.overflow".getInstrName(wordTy, builtin = true) + + let br {.inject.} = ctx.createBuilder() + defer: br.dispose() + + var fn = m.getFunction(cstring name) + if fn.pointer.isNil(): + let retTy = ctx.struct_t([wordTy, ctx.int1_t()]) + let fnTy = function_t(retTy, [wordTy, wordTy]) + discard m.addFunction(cstring name, fnTy) +
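Not shown in the diff: how the declared intrinsic is then driven at a call site. A minimal sketch of chaining `llvm_add_overflow` into a two-limb addition, assuming `ctx`, `m`, a builder `br` positioned inside a basic block of a function in `m`, and i64 values `a0`, `a1`, `b0`, `b1` (the `addcarry` super-instruction defined below packages exactly this pattern):

```nim
# Sketch: a 128-bit add from two 64-bit limbs via llvm.uadd.with.overflow.
let wordTy = ctx.int_t(64)
ctx.def_llvm_add_overflow(m, wordTy)  # declare the intrinsic once per module
let (c0, r0) = br.llvm_add_overflow(a0, b0, "lo")
let c0w = br.zext(c0, wordTy, "c0w")  # widen the i1 carry to a word
let (c1, hiPart) = br.llvm_add_overflow(a1, b1, "hiPart")
let (c2, r1) = br.llvm_add_overflow(hiPart, c0w, "hi")
let carryOut = br.`or`(c1, c2, "carryOut")  # c1 and c2 cannot both be set
```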
+proc llvm_sub_overflow*(br: BuilderRef, a, b: ValueRef, name = ""): tuple[borrowOut, r: ValueRef] = + ## (bOut, result) <- a-b + let ty = a.getTypeOf() + let intrin_name = "llvm.usub.with.overflow".getInstrName(ty, builtin = true) + + let fn = br.getCurrentModule().getFunction(cstring intrin_name) + doAssert not fn.pointer.isNil, "Function '" & intrin_name & "' does not exist in the module\n" + + let ctx = br.getContext() + + let retTy = ctx.struct_t([ty, ctx.int1_t()]) + let fnTy = function_t(retTy, [ty, ty]) + let subo = br.call2(fnTy, fn, [a, b], cstring name) + let lo = br.extractValue(subo, 0, cstring(name & ".lo")) + let bOut = br.extractValue(subo, 1, cstring(name & ".borrow")) + return (bOut, lo) + +template defSuperInstruction[N: static int]( + module: ModuleRef, baseName: string, + returnType: TypeRef, + paramTypes: array[N, TypeRef], + body: untyped) = + ## Boilerplate for super instruction definition. + ## Creates a magic `llvmParams` variable to tuple-destructure + ## to access the inputs, + ## and `br` for building the instructions + let ty = paramTypes[0] + let name = baseName.getInstrName(ty) + + let ctx = module.getContext() + let br {.inject.} = ctx.createBuilder() + defer: br.dispose() + + var fn = module.getFunction(cstring name) + if fn.pointer.isNil(): + let fnTy = function_t(returnType, paramTypes) + fn = module.addFunction(cstring name, fnTy) + let blck = ctx.appendBasicBlock(fn) + br.positionAtEnd(blck) + + let llvmParams {.inject.} = unpackParams(br, (paramTypes, paramTypes)) + template tagParameter(idx: int, attr: string) {.inject, used.} = + let a = ctx.createAttr(attr) + fn.addAttribute(cint idx, a) + body + + fn.setFnCallConv(Fast) + fn.setLinkage(linkInternal) + fn.setSection(SectionName) + fn.addAttribute(kAttrFnIndex, ctx.createAttr("alwaysinline")) + +proc def_addcarry*(ctx: ContextRef, m: ModuleRef, carryTy, wordTy: TypeRef) = + ## Define (carryOut, result) <- a+b+carryIn + + let retType = ctx.struct_t([carryTy, wordTy]) + let inType = [wordTy, wordTy, carryTy] + + m.defSuperInstruction("addcarry", retType, inType): + let (a, b, carryIn) = llvmParams + + let (carry0, add) = br.llvm_add_overflow(a, b, "a_plus_b") + let cIn = br.zext(carryIn, wordTy, name = "carryIn") + let (carry1, adc) = br.llvm_add_overflow(cIn, add, "a_plus_b_plus_cIn") + let carryOut = br.`or`(carry0, carry1, name = "carryOut") + + var ret = br.insertValue(poison(retType), adc, 1, "lo") + ret = br.insertValue(ret, carryOut, 0, "ret") + br.ret(ret) + +proc addcarry*(br: BuilderRef, a, b, carryIn: ValueRef): tuple[carryOut, r: ValueRef] = + ## (cOut, result) <- a+b+cIn + let ty = a.getTypeOf() + let tyC = carryIn.getTypeOf() + let name = "addcarry".getInstrName(ty) + + let fn = br.getCurrentModule().getFunction(cstring name) + doAssert not fn.pointer.isNil, "Function '" & name & "' does not exist in the module\n" + + let retTy = br.getContext().struct_t([tyC, ty]) + let fnTy = function_t(retTy, [ty, ty, tyC]) + let adc = br.call2(fnTy, fn, [a, b, carryIn], name = "adc") + adc.setInstrCallConv(Fast) + let lo = br.extractValue(adc, 1, name = "adc.lo") + let cOut = br.extractValue(adc, 0, name = "adc.carry") + return (cOut, lo) + +proc def_subborrow*(ctx: ContextRef, m: ModuleRef, borrowTy, wordTy: TypeRef) = + ## Define (borrowOut, result) <- a-b-borrowIn + + let retType = ctx.struct_t([borrowTy, wordTy]) + let inType = [wordTy, wordTy, borrowTy] + + m.defSuperInstruction("subborrow", retType, inType): + let (a, b, borrowIn) = llvmParams + + let (borrow0, sub) =
br.llvm_sub_overflow(a, b, "a_minus_b") + let bIn = br.zext(borrowIn, wordTy, name = "borrowIn") + let (borrow1, sbb) = br.llvm_sub_overflow(sub, bIn, "sbb") + let borrowOut = br.`or`(borrow0, borrow1, name = "borrowOut") + + var ret = br.insertValue(poison(retType), sbb, 1, "lo") + ret = br.insertValue(ret, borrowOut, 0, "ret") + br.ret(ret) + +proc subborrow*(br: BuilderRef, a, b, borrowIn: ValueRef): tuple[borrowOut, r: ValueRef] = + ## (bOut, result) <- a-b-bIn + let ty = a.getTypeOf() + let tyC = borrowIn.getTypeOf() + let name = "subborrow".getInstrName(ty) + + let fn = br.getCurrentModule().getFunction(cstring name) + doAssert not fn.pointer.isNil, "Function '" & name & "' does not exist in the module\n" + + let retTy = br.getContext().struct_t([tyC, ty]) + let fnTy = function_t(retTy, [ty, ty, tyC]) + let sbb = br.call2(fnTy, fn, [a, b, borrowIn], name = "sbb") + sbb.setInstrCallConv(Fast) + let lo = br.extractValue(sbb, 1, name = "sbb.lo") + let bOut = br.extractValue(sbb, 0, name = "sbb.borrow") + return (bOut, lo) + +proc mulExt*(bld: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + ## (hi, lo) <- a*b + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "mulx0_") + let b = bld.zext(b, dblTy, name = "mulx1_") + let r = bld.mulNUW(a, b, name = "mulx_") + + let lo = bld.trunc(r, ty, name = "mullo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "mulhi_") + return (hi, lo) + +proc smulExt*(bld: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = + ## Signed extended precision multiplication + ## (hi, lo) <- a*b + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.sext(a, dblTy, name = "smulx0_") + let b = bld.sext(b, dblTy, name = "smulx1_") + let r = bld.mulNSW(a, b, name = "smulx_") + + let lo = bld.trunc(r, ty, name = "smullo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "smulhi_") + return (hi, lo) + +proc muladd1*(bld: BuilderRef, a, b, c: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding any c cannot overflow + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "fmax0_") + let b = bld.zext(b, dblTy, name = "fmax1_") + let ab = bld.mulNUW(a, b, name = "fmax01_") + + let c = bld.zext(c, dblTy, name = "fmax2_") + let r = bld.addNUW(ab, c, name = "fmax_") + + let lo = bld.trunc(r, ty, name = "fmalo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "fmahi_") + return (hi, lo) + +proc muladd2*(bld: BuilderRef, a, b, c1, c2: ValueRef): tuple[hi, lo: ValueRef] = + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000) + ## and we have enough space to add again 0xFFFFFFFF without overflowing + let ctx = bld.getContext() + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "fmaa0_") + let b = bld.zext(b, dblTy, name = "fmaa1_") + let ab = bld.mulNUW(a, b, name = "fmaa01_") +
let c1 = bld.zext(c1, dblTy, name = "fmaa2_") + let abc1 = bld.addNUW(ab, c1, name = "fmaa012_") + let c2 = bld.zext(c2, dblTy, name = "fmaa3_") + let r = bld.addNUW(abc1, c2, name = "fmaa_") + + let lo = bld.trunc(r, ty, name = "fmaalo_") + let hi = bld.hi(r, ty, oversize = bits, prefix = "fmaahi_") + return (hi, lo) + +proc mulAcc*(bld: BuilderRef, tuv: var ValueRef, a, b: ValueRef) = + ## (t, u, v) <- (t, u, v) + a * b + let ctx = bld.getContext() + + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let x3ty = tuv.getTypeOf() + let x3bits = x3ty.getIntTypeWidth() + + doAssert bits * 3 == x3bits + + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "mac0_") + let b = bld.zext(b, dblTy, name = "mac1_") + let ab = bld.mulNUW(a, b, name = "mac01_") + + let wide_ab = bld.zext(ab, x3ty, name = "mac01x_") + let r = bld.addNUW(tuv, wide_ab, "mac_") + + tuv = r + +proc mulDoubleAcc*(bld: BuilderRef, tuv: var ValueRef, a, b: ValueRef) = + ## (t, u, v) <- (t, u, v) + 2 * a * b + let ctx = bld.getContext() + + let ty = a.getTypeOf() + let bits = ty.getIntTypeWidth() + + let x3ty = tuv.getTypeOf() + let x3bits = x3ty.getIntTypeWidth() + + doAssert bits * 3 == x3bits + + let dbl = bits shl 1 + let dblTy = ctx.int_t(dbl) + + let a = bld.zext(a, dblTy, name = "macd0_") + let b = bld.zext(b, dblTy, name = "macd1_") + let ab = bld.mulNUW(a, b, name = "macd01_") + + let wide_ab = bld.zext(ab, x3ty, name = "macd01x_") + let r1 = bld.addNUW(tuv, wide_ab, "macdpart_") + let r2 = bld.addNUW(r1, wide_ab, "macd_") + + tuv = r2 diff --git a/research/codegen/x86_instr.nim b/research/codegen/x86_instr.nim deleted file mode 100644 index a4a19219..00000000 --- a/research/codegen/x86_instr.nim +++ /dev/null @@ -1,96 +0,0 @@ -# Constantine -# Copyright (c) 2018-2019 Status Research & Development GmbH -# Copyright (c) 2020-Present Mamy André-Ratsimbazafy -# Licensed and distributed under either of -# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). -# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import - constantine/platforms/abis/c_abi, - constantine/platforms/llvm/llvm, - constantine/platforms/primitives, - constantine/math_compiler/ir, - ./x86_inlineasm - -export x86_inlineasm - -# ############################################################ -# -# x86 API -# -# ############################################################ - -proc defMulExt*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_mulExt64" - else: cstring"hw_mulExt32" - - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - - let mulExtTy = if wordSize == 64: function_t(doublePrec_t, [asy.i64_t, asy.i64_t]) - else: function_t(doublePrec_t, [asy.i32_t, asy.i32_t]) - let mulExtKernel = asy.module.addFunction(procName, mulExtTy) - let blck = asy.ctx.appendBasicBlock(mulExtKernel, "mulExtBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let a = bld.zext(mulExtKernel.getParam(0), doublePrec_t) - let b = bld.zext(mulExtKernel.getParam(1), doublePrec_t) - let r = bld.mul(a, b) - - bld.ret r - - return (mulExtTy, mulExtKernel) - -proc defHi*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_hi64" - else: cstring"hw_hi32" - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - let singlePrec_t = if wordSize == 64: asy.i64_t - else: asy.i32_t - - let hiTy = function_t(singlePrec_t, [doublePrec_t]) - - let hiKernel = asy.module.addFunction(procName, hiTy) - let blck = asy.ctx.appendBasicBlock(hiKernel, "hiBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - # %1 = zext i32 64 to i128 - let shift = bld.zext(constInt(asy.i32_t, culonglong wordSize, signExtend = LlvmBool(0)), doublePrec_t) - # %hiLarge = lshr i128 %input, %1 - let hiLarge = bld.lshr(hiKernel.getParam(0), shift) - # %hi = trunc i128 %hiLarge to i64 - let hi = bld.trunc(hiLarge, singlePrec_t) - - bld.ret hi - - return (hiTy, hiKernel) - -proc defLo*(asy: Assembler_LLVM, wordSize: int): FnDef = - - let procName = if wordSize == 64: cstring"hw_lo64" - else: cstring"hw_lo32" - let doublePrec_t = if wordSize == 64: asy.i128_t - else: asy.i64_t - let singlePrec_t = if wordSize == 64: asy.i64_t - else: asy.i32_t - - let loTy = function_t(singlePrec_t, [doublePrec_t]) - - let loKernel = asy.module.addFunction(procName, loTy) - let blck = asy.ctx.appendBasicBlock(loKernel, "loBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - # %lo = trunc i128 %input to i64 - let lo = bld.trunc(loKernel.getParam(0), singlePrec_t) - bld.ret lo - return (loTy, loKernel) diff --git a/research/codegen/x86_poc.nim b/research/codegen/x86_poc.nim index c5c376fe..4ec5b67f 100644 --- a/research/codegen/x86_poc.nim +++ b/research/codegen/x86_poc.nim @@ -8,140 +8,121 @@ import constantine/platforms/llvm/llvm, - constantine/platforms/primitives, - constantine/math_compiler/ir, - ./x86_instr - -echo "LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX" - -proc big_mul_gen(asy: Assembler_LLVM): FnDef = - - - let procName = "big_mul_64x4" - let N = 4 - let ty = array_t(asy.i64_t, N) - let pty = pointer_t(ty) - - let bigMulTy = function_t(asy.void_t, [pty, pty, pty]) - let bigMulKernel = asy.module.addFunction(cstring procName, bigMulTy) - let blck = asy.ctx.appendBasicBlock(bigMulKernel, "bigMulBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let (hiTy, hiKernel) = asy.defHi(64) - proc hi(builder: BuilderRef, a: ValueRef): ValueRef = - return builder.call2( - hiTy, hiKernel, - [a], "hi64_" - ) - - let 
(loTy, loKernel) = asy.defLo(64) - proc lo(builder: BuilderRef, a: ValueRef): ValueRef = - return builder.call2( - loTy, loKernel, - [a], "lo64_" - ) - - let (mulExtTy, mulExtKernel) = asy.defMulExt(64) - bld.positionAtEnd(blck) - - proc mulx(builder: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] = - # LLVM does not support multipel return value at the moment - # https://nondot.org/sabre/LLVMNotes/MultipleReturnValues.txt - # So we don't create an LLVM function - let t = builder.call2( - mulExtTy, mulExtKernel, - [a, b], "mulx64_" - ) - - builder.positionAtEnd(blck) - let lo = builder.lo(t) - let hi = builder.hi(t) - return (hi, lo) - - let r = bld.asArray(bigMulKernel.getParam(0), ty) - let a = bld.asArray(bigMulKernel.getParam(1), ty) - let b = bld.asArray(bigMulKernel.getParam(2), ty) - - let t = bld.makeArray(ty) - - block: # i = 0 - # TODO: properly implement add/adc in pure LLVM - - # TODO: ensure flags are cleared properly, compiler might optimize this away - t[0] = bld.`xor`(t[0], t[0]) - let (hi, lo) = bld.mulx(a[0], b[0]) - r[0] = lo - t[0] = hi - - for j in 1 ..< N: - let (hi , lo) = bld.mulx(a[j], b[0]) - t[j] = hi - # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target - discard bld.adcx_rr(t[j-1], lo) # Replace by LLVM IR uadd_with_overflow - - # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target - discard bld.adcx_rr(t[N-1], 0) - - # TODO: rotate t array - - # TODO: impl i in 1 ..< N - - bld.store(r, t) - bld.retVoid() - return (bigMulTy, bigMulKernel) - -when isMainModule: - # It's not the Nvidia PTX backend but it's fine + constantine/math_compiler/[ir, pub_fields] + +const Fields = [ + ( + "bn254_snarks_fp", 254, + "30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47" + ), + ( + "bn254_snarks_fr", 254, + "30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001" + ), + + ( + "secp256k1_fp", 256, + "fffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f" + ), + ( + "secp256k1_fr", 256, + "fffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141" + ), + ( + "bls12_381_fp", 381, + "1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab" + ), + ( + "bls12_381_fr", 255, + "73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001" + ), + ( + "bls12_377_fp", 377, + "01ae3a4617c510eac63b05c06ca1493b1a22d9f300f5138f1ef3622fba094800170b5d44300000008508c00000000001" + ), + ( + "bls12_377_fr", 253, + "12ab655e9a2ca55660b44d1e5c37b00159aa76fed00000010a11800000000001" + ), + ( + "bls24_315_fp", 315, + "4c23a02b586d650d3f7498be97c5eafdec1d01aa27a1ae0421ee5da52bde5026fe802ff40300001" + ), + ( + "bls24_315_fr", 253, + "196deac24a9da12b25fc7ec9cf927a98c8c480ece644e36419d0c5fd00c00001" + ), + ( + "bls24_317_fp", 317, + "1058CA226F60892CF28FC5A0B7F9D039169A61E684C73446D6F339E43424BF7E8D512E565DAB2AAB" + ), + ( + "bls24_317_fr", 255, + "443F917EA68DAFC2D0B097F28D83CD491CD1E79196BF0E7AF000000000000001" + ), +] + +proc t_field_add() = let asy = Assembler_LLVM.new(bkX86_64_Linux, cstring("x86_poc")) - let bigMul = asy.big_mul_gen() + for F in Fields: + let fd = asy.ctx.configureField( + F[0], F[1], F[2], + v = 1, w = 64) - asy.module.verify(AbortProcessAction) + asy.definePrimitives(fd) + + discard asy.genFpAdd(fd) echo "=========================================" - echo "LLVM IR\n" + echo "LLVM IR unoptimized\n" echo asy.module echo
"=========================================" - - var engine: ExecutionEngineRef - initializeFullNativeTarget() - createJITCompilerForModule(engine, asy.module, optLevel = 0) - - let jitMul = cast[proc(r: var array[4, uint64], a, b: array[4, uint64]){.noconv.}]( - engine.getFunctionAddress("big_mul_64x4") - ) - - var r: array[4, uint64] - r.jitMul([uint64 1, 2, 3, 4], [uint64 1, 1, 1, 1]) - echo "jitMul = ", r - - # block: - # Cleanup - Assembler_LLVM is auto-managed - # engine.dispose() # also destroys the module attached to it, which double_frees Assembler_LLVM asy.module - echo "LLVM JIT - calling big_mul_64x4 SUCCESS" + asy.module.verify(AbortProcessAction) # -------------------------------------------- - # See the assembly- note it might be different from what the JIT compiler did - + # See the assembly - note it might be different from what the JIT compiler did + initializeFullNativeTarget() const triple = "x86_64-pc-linux-gnu" let machine = createTargetMachine( target = toTarget(triple), triple = triple, cpu = "", - features = "adx,bmi2", # TODO check the proper way to pass options - level = CodeGenLevelAggressive, + features = "", # "adx,bmi2", # TODO check the proper way to pass options + level = CodeGenLevelDefault, reloc = RelocDefault, codeModel = CodeModelDefault ) + # Due to https://github.com/llvm/llvm-project/issues/102868 + # We want to reproduce the codegen from llc.cpp + # However we can't reproduce the code from either + # - LLVM16 https://github.com/llvm/llvm-project/blob/llvmorg-16.0.6/llvm/tools/llc/llc.cpp + # need legacy PassManagerRef and the PassManagerBuilder that interfaces between the + # legacy PssManagerRef and new PassBuilder has been deleted in LLVM17 + # + # - and contrary to what is claimed in https://llvm.org/docs/NewPassManager.html#id2 + # the C API of PassBuilderRef is ghost town. 
+ # + # So we somewhat reproduce the optimization passes from + # https://reviews.llvm.org/D145835 + let pbo = createPassBuilderOptions() pbo.setMergeFunctions() let err = asy.module.runPasses( - "default,function-attrs,memcpyopt,sroa,mem2reg,gvn,dse,instcombine,inline,adce", + # "default,memcpyopt,sroa,mem2reg,function-attrs,inline,gvn,dse,aggressive-instcombine,adce", + "function(require,require,require,require,require)" & + ",function(aa-eval)" & + ",always-inline,hotcoldsplit,inferattrs,instrprof,recompute-globalsaa" & + ",cgscc(argpromotion,function-attrs)" & + ",require,partial-inliner,called-value-propagation" & + ",scc-oz-module-inliner,module-inline" & # Buggy optimization + ",function(verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,instsimplify)" & + ",function(lower-constant-intrinsics,consthoist,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,verify)" & + ",memcpyopt,sroa,dse,aggressive-instcombine,gvn,ipsccp,deadargelim,adce" & + "", machine, pbo ) @@ -154,174 +135,35 @@ when isMainModule: quit 1 echo "=========================================" - echo "Assembly\n" + echo "LLVM IR optimized\n" - echo machine.emitTo[:string](asy.module, AssemblyFile) + echo asy.module echo "=========================================" - # Output - # ------------------------------------------------------------------ - - #[ - LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX - ========================================= - LLVM IR - - ; ModuleID = 'x86_poc' - source_filename = "x86_poc" - target triple = "x86_64-pc-linux-gnu" - - define void @big_mul_64x4(ptr %0, ptr %1, ptr %2) { - bigMulBody: - %3 = alloca [4 x i64], align 8 - %4 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %5 = load i64, ptr %4, align 4 - %6 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %7 = load i64, ptr %6, align 4 - %8 = xor i64 %5, %7 - %9 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - store i64 %8, ptr %9, align 4 - %10 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 0 - %11 = load i64, ptr %10, align 4 - %12 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %13 = load i64, ptr %12, align 4 - %mulx64_ = call i128 @hw_mulExt64(i64 %11, i64 %13) - %lo64_ = call i64 @hw_lo64(i128 %mulx64_) - %hi64_ = call i64 @hw_hi64(i128 %mulx64_) - %14 = getelementptr inbounds [4 x i64], ptr %0, i32 0, i32 0 - store i64 %lo64_, ptr %14, align 4 - %15 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - store i64 %hi64_, ptr %15, align 4 - %16 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 1 - %17 = load i64, ptr %16, align 4 - %18 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %19 = load i64, ptr %18, align 4 - %mulx64_1 = call i128 @hw_mulExt64(i64 %17, i64 %19) - %lo64_2 = call i64 @hw_lo64(i128 %mulx64_1) - %hi64_3 = call i64 @hw_hi64(i128 %mulx64_1) - %20 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1 - store i64 %hi64_3, ptr %20, align 4 - %21 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0 - %22 = load i64, ptr %21, align 4 - %23 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %22, i64 %lo64_2) - %24 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 2 - %25 = load i64, ptr %24, align 4 - %26 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %27 = load i64, ptr %26, align 4 - %mulx64_4 = call i128 @hw_mulExt64(i64 %25, i64 %27) - %lo64_5 = call i64 @hw_lo64(i128 %mulx64_4) - %hi64_6 = call i64 @hw_hi64(i128 %mulx64_4) - %28 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2 - store i64 
%hi64_6, ptr %28, align 4 - %29 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1 - %30 = load i64, ptr %29, align 4 - %31 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %30, i64 %lo64_5) - %32 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 3 - %33 = load i64, ptr %32, align 4 - %34 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0 - %35 = load i64, ptr %34, align 4 - %mulx64_7 = call i128 @hw_mulExt64(i64 %33, i64 %35) - %lo64_8 = call i64 @hw_lo64(i128 %mulx64_7) - %hi64_9 = call i64 @hw_hi64(i128 %mulx64_7) - %36 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3 - store i64 %hi64_9, ptr %36, align 4 - %37 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2 - %38 = load i64, ptr %37, align 4 - %39 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %38, i64 %lo64_8) - %40 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3 - %41 = load i64, ptr %40, align 4 - %42 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %41, i64 0) - %43 = load [4 x i64], ptr %3, align 4 - store [4 x i64] %43, ptr %0, align 4 - ret void - } - - define i64 @hw_hi64(i128 %0) { - hiBody: - %1 = lshr i128 %0, 64 - %2 = trunc i128 %1 to i64 - ret i64 %2 - } - - define i64 @hw_lo64(i128 %0) { - loBody: - %1 = trunc i128 %0 to i64 - ret i64 %1 - } - - define i128 @hw_mulExt64(i64 %0, i64 %1) { - mulExtBody: - %2 = zext i64 %0 to i128 - %3 = zext i64 %1 to i128 - %4 = mul i128 %2, %3 - ret i128 %4 - } + echo "=========================================" + echo "Assembly\n" - ========================================= - jitMul = [0, 0, 0, 0] - LLVM JIT - calling big_mul_64x4 SUCCESS - ========================================= - Assembly + echo machine.emitTo[:string](asy.module, AssemblyFile) + echo "=========================================" - .text - .file "x86_poc" - .globl big_mul_64x4 - .p2align 4, 0x90 - .type big_mul_64x4,@function - big_mul_64x4: - .cfi_startproc - movq %rdx, %rcx - movq (%rdx), %rax - mulq (%rsi) - movq %rdx, %r8 - movq %rax, (%rdi) - movq (%rcx), %rcx - movq %rcx, %rax - mulq 8(%rsi) - movq %rdx, %r9 - movq %rcx, %rax - mulq 16(%rsi) - movq %rdx, %r10 - movq %rcx, %rax - mulq 24(%rsi) - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %rdx, 24(%rdi) - retq - .Lfunc_end0: - .size big_mul_64x4, .Lfunc_end0-big_mul_64x4 - .cfi_endproc + # var engine: ExecutionEngineRef + # initializeFullNativeTarget() + # createJITCompilerForModule(engine, asy.module, optLevel = 3) - .globl hw_hi64 - .p2align 4, 0x90 - .type hw_hi64,@function - hw_hi64: - movq %rsi, %rax - retq - .Lfunc_end1: - .size hw_hi64, .Lfunc_end1-hw_hi64 + # let fn32 = cm32.genSymbol(opFpAdd) + # let fn64 = cm64.genSymbol(opFpAdd) - .globl hw_lo64 - .p2align 4, 0x90 - .type hw_lo64,@function - hw_lo64: - movq %rdi, %rax - retq - .Lfunc_end2: - .size hw_lo64, .Lfunc_end2-hw_lo64 + # let jitFpAdd64 = cast[proc(r: var array[4, uint64], a, b: array[4, uint64]){.noconv.}]( + # engine.getFunctionAddress(cstring fn64) + # ) - .globl hw_mulExt64 - .p2align 4, 0x90 - .type hw_mulExt64,@function - hw_mulExt64: - movq %rsi, %rax - mulq %rdi - retq - .Lfunc_end3: - .size hw_mulExt64, .Lfunc_end3-hw_mulExt64 + # var r: array[4, uint64] + # r.jitFpAdd64([uint64 1, 2, 3, 4], [uint64 1, 1, 1, 1]) + # echo "jitFpAdd64 = ", r - .section ".note.GNU-stack","",@progbits + # # block: + # # Cleanup - Assembler_LLVM is auto-managed + # # engine.dispose() # also destroys the module attached to it, which double_frees Assembler_LLVM asy.module + # echo "LLVM JIT - calling FpAdd64 SUCCESS" - 
========================================= - ]# +t_field_add() diff --git a/tests/gpu/t_nvidia_fp.nim b/tests/gpu/t_nvidia_fp.nim index b3aa873f..af4de007 100644 --- a/tests/gpu/t_nvidia_fp.nim +++ b/tests/gpu/t_nvidia_fp.nim @@ -70,9 +70,9 @@ proc t_field_add(curve: static Algebra) = # Codegen # ------------------------- let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, size32) + let cm32 = CurveMetadata.init(asy, curve, w32) asy.genFieldAddPTX(cm32) - let cm64 = CurveMetadata.init(asy, curve, size64) + let cm64 = CurveMetadata.init(asy, curve, w64) asy.genFieldAddPTX(cm64) let ptx = asy.codegenNvidiaPTX(sm) @@ -124,9 +124,9 @@ proc t_field_sub(curve: static Algebra) = # Codegen # ------------------------- let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, size32) + let cm32 = CurveMetadata.init(asy, curve, w32) asy.genFieldSubPTX(cm32) - let cm64 = CurveMetadata.init(asy, curve, size64) + let cm64 = CurveMetadata.init(asy, curve, w64) asy.genFieldSubPTX(cm64) let ptx = asy.codegenNvidiaPTX(sm) @@ -178,14 +178,14 @@ proc t_field_mul(curve: static Algebra) = # Codegen # ------------------------- let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, size32) + let cm32 = CurveMetadata.init(asy, curve, w32) asy.genFieldMulPTX(cm32) # 64-bit integer fused-multiply-add with carry is buggy: # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 - # let cm64 = CurveMetadata.init(asy, curve, size64) + # let cm64 = CurveMetadata.init(asy, curve, w64) # asy.genFieldMulPTX(cm64) let ptx = asy.codegenNvidiaPTX(sm)
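For reference, the super-instructions introduced in this diff are meant to be chained limb by limb at their call sites. A minimal sketch of an n-limb addition loop, assuming `def_addcarry` has already declared the super-instruction for the word type, `br` is positioned inside a basic block, `a` and `b` are seqs of word-sized `ValueRef`s and `cTy = ctx.int1_t()` (the actual field addition is generated by `modadd`/`genFpAdd` in the math_compiler):

```nim
# Sketch only: chain addcarry across limbs; a final conditional
# subtraction of the modulus (not shown) turns this into a modular add.
var carry = constInt(cTy, 0)
var r = newSeq[ValueRef](a.len)
for i in 0 ..< a.len:
  let (cOut, ri) = br.addcarry(a[i], b[i], carry)
  carry = cOut
  r[i] = ri
```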