From 7b29fbdf5dfd1ed28ef3b2678b316a04dcb38153 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Fri, 8 Sep 2023 23:23:26 +0200
Subject: [PATCH] stash prep for Barret Reduction

---
 benchmarks/bench_gmp_modexp.nim               | 108 ++++++++
 benchmarks/bench_gmp_modmul.nim               | 182 ++++++++++++++
 constantine/math/arithmetic/bigints.nim       |   2 +-
 .../arithmetic/bigints_views.nim              |   6 +-
 .../{limbs_division.nim => limbs_divmod.nim}  |  22 +-
 .../arithmetic/limbs_divmod_vartime.nim       | 237 ++++++++++++++++++
 .../arithmetic/limbs_montgomery.nim           |  13 +-
 .../intrinsics/extended_precision_vartime.nim | 158 ++++++++++++
 .../t_bigints_mod.nim                         |   6 +-
 .../t_bigints_mod_vs_gmp.nim                  |  19 +-
 .../t_bigints_powmod_vs_gmp.nim               |   2 +-
 11 files changed, 729 insertions(+), 26 deletions(-)
 create mode 100644 benchmarks/bench_gmp_modexp.nim
 create mode 100644 benchmarks/bench_gmp_modmul.nim
 rename constantine/math_arbitrary_precision/arithmetic/{limbs_division.nim => limbs_divmod.nim} (92%)
 create mode 100644 constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim
 create mode 100644 constantine/platforms/intrinsics/extended_precision_vartime.nim

diff --git a/benchmarks/bench_gmp_modexp.nim b/benchmarks/bench_gmp_modexp.nim
new file mode 100644
index 00000000..341f81a6
--- /dev/null
+++ b/benchmarks/bench_gmp_modexp.nim
@@ -0,0 +1,108 @@
+import
+  ../constantine/math/arithmetic,
+  ../constantine/math/io/[io_bigints, io_fields],
+  ../constantine/math_arbitrary_precision/arithmetic/[bigints_views, limbs_views, limbs_montgomery, limbs_mod2k],
+  ../constantine/math/config/[type_bigint, curves, precompute],
+  ../constantine/platforms/abstractions,
+  ../constantine/serialization/codecs,
+  ../helpers/prng_unsafe,
+  std/[times, monotimes, strformat]
+
+import gmp
+# import stint
+
+const # https://gmplib.org/manual/Integer-Import-and-Export.html
+  GMP_WordLittleEndian = -1'i32
+  GMP_WordNativeEndian = 0'i32
+  GMP_WordBigEndian = 1'i32
+
+  GMP_MostSignificantWordFirst = 1'i32
+  GMP_LeastSignificantWordFirst = -1'i32
+
+# let M = Mod(BN254_Snarks)
+const bits = 256
+const expBits = bits # Stint only supports same size args
+
+var rng: RngState
+rng.seed(1234)
+
+for i in 0 ..< 5:
+  echo "i: ", i
+  # -------------------------
+  let M = rng.random_long01Seq(BigInt[bits])
+  let a = rng.random_long01Seq(BigInt[bits])
+
+  var exponent = newSeq[byte](expBits div 8)
+  for i in 0 ..< expBits div 8:
+    exponent[i] = byte rng.next()
+
+  # -------------------------
+
+  let aHex = a.toHex()
+  let eHex = exponent.toHex()
+  let mHex = M.toHex()
+
+  echo "  base:     ", a.toHex()
+  echo "  exponent: ", exponent.toHex()
+  echo "  modulus:  ", M.toHex()
+
+  # -------------------------
+
+  var elapsedCtt, elapsedStint, elapsedGMP: int64
+
+  block:
+    var r: BigInt[bits]
+    let start = getMonotime()
+    r.limbs.powMod_vartime(a.limbs, exponent, M.limbs, window = 4)
+    let stop = getMonotime()
+
+    elapsedCtt = inNanoseconds(stop-start)
+
+    echo "  r Constantine:       ", r.toHex()
+    echo "  elapsed Constantine: ", elapsedCtt, " ns"
+
+  # -------------------------
+
+  # block:
+  #   let aa = Stuint[bits].fromHex(aHex)
+  #   let ee = Stuint[expBits].fromHex(eHex)
+  #   let mm = Stuint[bits].fromHex(mHex)
+
+  #   var r: Stuint[bits]
+  #   let start = getMonotime()
+  #   r = powmod(aa, ee, mm)
+  #   let stop = getMonotime()
+
+  #   elapsedStint = inNanoseconds(stop-start)
+
+  #   echo "  r stint:             ", r.toHex()
+  #   echo "  elapsed Stint:       ", elapsedStint, " ns"
+
+  block:
+    var aa, ee, mm, rr: mpz_t
+    mpz_init(aa)
+    mpz_init(ee)
+    mpz_init(mm)
+    mpz_init(rr)
+
+    aa.mpz_import(a.limbs.len, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, a.limbs[0].unsafeAddr)
+    let e = BigInt[expBits].unmarshal(exponent, bigEndian)
+    ee.mpz_import(e.limbs.len, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, e.limbs[0].unsafeAddr)
+    mm.mpz_import(M.limbs.len, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, M.limbs[0].unsafeAddr)
+
+    let start = getMonotime()
+    rr.mpz_powm(aa, ee, mm)
+    let stop = getMonotime()
+
+    elapsedGMP = inNanoSeconds(stop-start)
+
+    var r: BigInt[bits]
+    var rWritten: csize
+    discard r.limbs[0].addr.mpz_export(rWritten.addr, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, rr)
+
+    echo "  r GMP:               ", r.toHex()
+    echo "  elapsed GMP:         ", elapsedGMP, " ns"
+
+  # echo &"\n  ratio Stint/Constantine: {float64(elapsedStint)/float64(elapsedCtt):.3f}x"
+  echo &"  ratio GMP/Constantine: {float64(elapsedGMP)/float64(elapsedCtt):.3f}x"
+  echo "---------------------------------------------------------"
\ No newline at end of file
diff --git a/benchmarks/bench_gmp_modmul.nim b/benchmarks/bench_gmp_modmul.nim
new file mode 100644
index 00000000..5c0e8f29
--- /dev/null
+++ b/benchmarks/bench_gmp_modmul.nim
@@ -0,0 +1,182 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Standard library
+  std/[macros, times, strutils, monotimes],
+  # Third-party
+  gmp,
+  # Internal
+  ../constantine/math/io/io_bigints,
+  ../constantine/math/arithmetic,
+  ../constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime,
+  ../constantine/platforms/abstractions,
+  ../constantine/serialization/codecs,
+  # Test utilities
+  ../helpers/prng_unsafe
+
+echo "\n------------------------------------------------------\n"
+# We test up to 1024-bit, more is really slow
+
+macro testSizes(rBits, aBits, bBits, body: untyped): untyped =
+  ## Configure sizes known at compile-time to test against GMP
+  result = newStmtList()
+
+  for size in [256, 384, 121*8]:
+    let aBitsVal = size
+    let bBitsVal = size
+    let rBitsVal = size * 2
+
+    result.add quote do:
+      block:
+        const `aBits` = `aBitsVal`
+        const `bBits` = `bBitsVal`
+        const `rBits` = `rBitsVal`
+        block:
+          `body`
+
+const # https://gmplib.org/manual/Integer-Import-and-Export.html
+  GMP_WordLittleEndian {.used.} = -1'i32
+  GMP_WordNativeEndian {.used.} = 0'i32
+  GMP_WordBigEndian {.used.} = 1'i32
+
+  GMP_MostSignificantWordFirst = 1'i32
+  GMP_LeastSignificantWordFirst {.used.} = -1'i32
+
+proc main() =
+  var rng: RngState
+  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  rng.seed(seed)
+  echo "\n------------------------------------------------------\n"
+  echo "rng xoshiro512** seed: ", seed
+  echo ""
+
+  var r, rMod, a, b: mpz_t
+  mpz_init(r)
+  mpz_init(rMod)
+  mpz_init(a)
+  mpz_init(b)
+
+  testSizes(rBits, aBits, bBits):
+    # echo "--------------------------------------------------------------------------------"
+    echo "Testing: mul  r (", align($rBits, 4), "-bit) <- a (", align($aBits, 4), "-bit) * b (", align($bBits, 4), "-bit)"
+
+    # Build the bigints
+    let aTest = rng.random_unsafe(BigInt[aBits])
+    var bTest = rng.random_unsafe(BigInt[bBits])
+
+    #########################################################
+    # Conversion to GMP
+    const aLen = (aBits + 7) div 8
+    const bLen = (bBits + 7) div 8
+
+    var aBuf: array[aLen, byte]
+    var bBuf: array[bLen, byte]
+
+    aBuf.marshal(aTest, bigEndian)
+    bBuf.marshal(bTest, bigEndian)
+
+    mpz_import(a, aLen, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, aBuf[0].addr)
+    mpz_import(b, bLen, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, bBuf[0].addr)
+
+    #########################################################
+    # Multiplication
+    const NumIters = 1000000
+
+    let startGMP = getMonoTime()
+    for _ in 0 ..< NumIters:
+      mpz_mul(r, a, b)
+    let stopGMP = getMonoTime()
+    echo "GMP         - ", aBits, "  x  ", bBits, "           -> ", rBits,  " mul: ", float(inNanoseconds((stopGMP-startGMP)))/float(NumIters), " ns"
+
+    # If a*b overflow the result size we truncate
+    const numWords = wordsRequired(rBits)
+    when numWords < wordsRequired(aBits+bBits):
+      echo "  truncating from ", wordsRequired(aBits+bBits), " words to ", numWords, " (2^", WordBitwidth * numWords, ")"
+      r.mpz_tdiv_r_2exp(r, WordBitwidth * numWords)
+
+    let startGMPmod = getMonoTime()
+    for _ in 0 ..< NumIters:
+      mpz_mod(rMod, a, b)
+    let stopGMPmod = getMonoTime()
+    echo "GMP         - ", aBits, " mod ", bBits, "           -> ", bBits,  " mod: ", float(inNanoseconds((stopGMPmod-startGMPmod)))/float(NumIters), " ns"
+
+    let startGMPmod2 = getMonoTime()
+    for _ in 0 ..< NumIters:
+      mpz_mod(rMod, r, b)
+    let stopGMPmod2 = getMonoTime()
+    echo "GMP         - ", rBits, " mod ", bBits, "           -> ", bBits,  " mod: ", float(inNanoseconds((stopGMPmod2-startGMPmod2)))/float(NumIters), " ns"
+
+
+    # Constantine
+    var rTest: BigInt[rBits]
+
+    let startCTT = getMonoTime()
+    for _ in 0 ..< NumIters:
+      rTest.prod(aTest, bTest)
+    let stopCTT = getMonoTime()
+    echo "Constantine - ", aBits, "  x  ", bBits, "           -> ", rBits,  " mul: ", float(inNanoseconds((stopCTT-startCTT)))/float(NumIters), " ns"
+
+    var rTestMod: BigInt[bBits]
+
+    let startCTTMod = getMonoTime()
+    for _ in 0 ..< NumIters:
+      rTestMod.reduce(aTest, bTest)
+    let stopCTTMod = getMonoTime()
+    echo "Constantine - ", aBits, " mod ", bBits, "           -> ", bBits,  " mod: ", float(inNanoseconds((stopCTTmod-startCTTmod)))/float(NumIters), " ns"
+
+    let startCTTvartimeMod = getMonoTime()
+    var q {.noInit.}: BigInt[bBits]
+    for _ in 0 ..< NumIters:
+      discard divRem_vartime(q.limbs, rTestMod.limbs, aTest.limbs, bTest.limbs)
+    let stopCTTvartimeMod = getMonoTime()
+    echo "Constantine - ", aBits, " mod ", bBits, " (vartime) -> ", bBits,  " mod: ", float(inNanoseconds((stopCTTvartimeMod-startCTTvartimeMod)))/float(NumIters), " ns"
+
+    let startCTTMod2 = getMonoTime()
+    for _ in 0 ..< NumIters:
+      rTestMod.reduce(rTest, bTest)
+    let stopCTTMod2 = getMonoTime()
+    echo "Constantine - ", rBits, " mod ", bBits, "           -> ", bBits,  " mod: ", float(inNanoseconds((stopCTTmod2-startCTTmod2)))/float(NumIters), " ns"
+
+    let startCTTvartimeMod2 = getMonoTime()
+    var q2 {.noInit.}: BigInt[bBits]
+    for _ in 0 ..< NumIters:
+      discard divRem_vartime(q2.limbs, rTestMod.limbs, rTest.limbs, bTest.limbs)
+    let stopCTTvartimeMod2 = getMonoTime()
+    echo "Constantine - ", rBits, " mod ", bBits, " (vartime) -> ", bBits,  " mod: ", float(inNanoseconds((stopCTTvartimeMod2-startCTTvartimeMod2)))/float(NumIters), " ns"
+
+    echo ""
+
+    #########################################################
+    # Check
+
+    {.push warnings: off.} # deprecated csize
+    var aW, bW, rW: csize  # Word written by GMP
+    {.pop.}
+
+    const rLen = numWords * WordBitWidth
+    var rGMP: array[rLen, byte]
+    discard mpz_export(rGMP[0].addr, rW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r)
+
+    var rConstantine: array[rLen, byte]
+    marshal(rConstantine, rTest, bigEndian)
+
+    # Note: in bigEndian, GMP aligns left while constantine aligns right
+    doAssert rGMP.toOpenArray(0, rW-1) == rConstantine.toOpenArray(rLen-rW, rLen-1), block:
+      # Reexport as bigEndian for debugging
+      discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a)
+      discard mpz_export(bBuf[0].addr, bW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, b)
+      "\nMultiplication with operands\n" &
+      "  a (" & align($aBits, 4) & "-bit):   " & aBuf.toHex & "\n" &
+      "  b (" & align($bBits, 4) & "-bit):   " & bBuf.toHex & "\n" &
+      "into r of size " & align($rBits, 4) & "-bit failed:" & "\n" &
+      "  GMP:            " & rGMP.toHex() & "\n" &
+      "  Constantine:    " & rConstantine.toHex() & "\n" &
+      "(Note that GMP aligns bytes left while constantine aligns bytes right)"
+
+main()
diff --git a/constantine/math/arithmetic/bigints.nim b/constantine/math/arithmetic/bigints.nim
index 1dfe244c..a5df367c 100644
--- a/constantine/math/arithmetic/bigints.nim
+++ b/constantine/math/arithmetic/bigints.nim
@@ -12,7 +12,7 @@ import
   ./limbs,
   ./limbs_extmul,
   ./limbs_exgcd,
-  ../../math_arbitrary_precision/arithmetic/limbs_division
+  ../../math_arbitrary_precision/arithmetic/limbs_divmod
 
 export BigInt
 
diff --git a/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim b/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim
index 2c0f9384..ffff1062 100644
--- a/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim
@@ -15,7 +15,7 @@ import
   ./limbs_mod2k,
   ./limbs_multiprec,
   ./limbs_extmul,
-  ./limbs_division
+  ./limbs_divmod_vartime
 
 # No exceptions allowed
 {.push raises: [], checks: off.}
@@ -61,6 +61,7 @@ func powOddMod_vartime*(
 
   if eBits == 1:
     r.view().reduce(a.view(), aBits, M.view(), mBits)
+    # discard r.reduce_vartime(a, M)
     return
 
   let L      = wordsRequired(mBits)
@@ -77,8 +78,9 @@ func powOddMod_vartime*(
     # For now, we call explicit reduction as it can handle all sizes.
     # TODO: explicit reduction uses constant-time division which is **very** expensive
     if a.len != M.len:
-      let t = allocStackArray(SecretWord, L)
+      var t = allocStackArray(SecretWord, L)
       t.LimbsViewMut.reduce(a.view(), aBits, M.view(), mBits)
+      # discard t.toOpenArray(0, L-1).reduce_vartime(a, M)
       rMont.LimbsViewMut.getMont(LimbsViewConst t, M.view(), LimbsViewConst r2.view(), m0ninv, mBits)
     else:
       rMont.LimbsViewMut.getMont(a.view(), M.view(), LimbsViewConst r2.view(), m0ninv, mBits)
diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod.nim
similarity index 92%
rename from constantine/math_arbitrary_precision/arithmetic/limbs_division.nim
rename to constantine/math_arbitrary_precision/arithmetic/limbs_divmod.nim
index aab1d902..03715c71 100644
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod.nim
@@ -63,7 +63,7 @@ func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
     a1 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R)
     m0 = (M[^1] shl (WordBitWidth-R)) or (M[^2] shr R)
 
-  # m0 has its high bit set. (a0, a1)/p0 fits in a limb.
+  # m0 has its high bit set. (a0, a1)/m0 fits in a limb.
   # Get a quotient q, at most we will be 2 iterations off
   # from the true quotient
   var q, r: SecretWord
@@ -78,29 +78,29 @@ func shlAddMod_estimate(a: LimbsViewMut, aLen: int,
 
   # Now substract a*2^64 - q*p
   var carry = Zero
-  var over_p = CtTrue                            # Track if quotient greater than the modulus
+  var overM = CtTrue                             # Track if quotient greater than the modulus
 
   for i in 0 ..< MLen:
-    var qp_lo: SecretWord
+    var qm_lo: SecretWord
 
-    block: # q*p
-      # q * p + carry (doubleword) carry from previous limb
-      muladd1(carry, qp_lo, q, M[i], carry)
+    block: # q*m
+      # q * m + carry (doubleword) carry from previous limb
+      muladd1(carry, qm_lo, q, M[i], carry)
 
     block: # a*2^64 - q*p
       var borrow: Borrow
-      subB(borrow, a[i], a[i], qp_lo, Borrow(0))
+      subB(borrow, a[i], a[i], qm_lo, Borrow(0))
       carry += SecretWord(borrow) # Adjust if borrow
 
-    over_p = mux(a[i] == M[i], over_p, a[i] > M[i])
+    overM = mux(a[i] == M[i], overM, a[i] > M[i])
 
   # Fix quotient, the true quotient is either q-1, q or q+1
   #
-  # if carry < q or carry == q and over_p we must do "a -= p"
-  # if carry > hi (negative result) we must do "a += p"
+  # if carry < q or carry == q and over_p we must do "a -= m"
+  # if carry > hi (negative result) we must do "a += m"
 
   result.neg = carry > hi
-  result.tooBig = not(result.neg) and (over_p or (carry < hi))
+  result.tooBig = not(result.neg) and (overM or (carry < hi))
 
 func shlAddMod(a: LimbsViewMut, aLen: int,
                c: SecretWord, M: LimbsViewConst, mBits: int) =
diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim
new file mode 100644
index 00000000..36d22e6f
--- /dev/null
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim
@@ -0,0 +1,237 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../../platforms/abstractions,
+  ../../platforms/intrinsics/extended_precision_vartime,
+  ./limbs_views,
+  ./limbs_fixedprec
+
+# No exceptions allowed
+{.push raises: [].}
+
+# ############################################################
+#
+#              Division and Modular Reduction
+#                    (variable-time)
+#
+# ############################################################
+
+func shlAddMod_multiprec_vartime(
+       a: var openArray[SecretWord], c: SecretWord,
+       M: openArray[SecretWord], mBits: int): SecretWord {.meter.} =
+  ## Fused modular left-shift + add
+  ## Computes: a <- a shl 2ʷ + c (mod M)
+  ## Returns: (a shl 2ʷ + c) / M
+  ##
+  ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
+  ##
+  ## The modulus `M` most-significant bit at `mBits` MUST be set.
+  ##
+  ## Specialized for M being a multi-precision integer.
+                                        # Assuming 64-bit words
+  let hi = a[^1]                        # Save the high word to detect carries
+  let R = mBits and (WordBitWidth - 1)  # R = mBits mod 64
+
+  var a0, a1, m0: SecretWord
+  if R == 0:                            # If the number of mBits is a multiple of 64
+    a0 = a[^1]                          #
+    copyWords(a.view(), 1,              # we can just shift words
+              a.view(), 0, a.len-1)     #
+    a[0] = c                            # and replace the first one by c
+    a1 = a[^1]
+    m0 = M[^1]
+  else:                                 # Else: need to deal with partial word shifts at the edge.
+    let clz = WordBitWidth-R
+    a0 = (a[^1] shl clz) or (a[^2] shr R)
+    copyWords(a.view(), 1,
+              a.view(), 0, a.len-1)
+    a[0] = c
+    a1 = (a[^1] shl clz) or (a[^2] shr R)
+    m0 = (M[^1] shl clz) or (M[^2] shr R)
+
+  # m0 has its high bit set. (a0, a1)/m0 fits in a limb.
+  # Get a quotient q, at most we will be 2 iterations off
+  # from the true quotient
+  var q: SecretWord                     # Estimate quotient
+  if bool(a0 == m0):                    # if a_hi == divisor
+    q = MaxWord                         # quotient = MaxWord (0b1111...1111)
+  elif bool(a0.isZero()) and
+       bool(a1 < m0):                   # elif q == 0, true quotient = 0
+    q = Zero
+    return q
+  else:
+    var r: SecretWord
+    div2n1n(q, r, a0, a1, m0)           # else instead of being of by 0, 1 or 2
+    q -= One                            # we return q-1 to be off by -1, 0 or 1
+
+  # Now substract a*2^64 - q*m
+  var carry = Zero
+  var overM = true                      # Track if quotient greater than the modulus
+
+  for i in 0 ..< M.len:
+    var qm_lo: SecretWord
+    block:                              # q*m
+      # q * m + carry (doubleword) carry from previous limb
+      muladd1(carry, qm_lo, q, M[i], carry)
+
+    block:                              # a*2^64 - q*m
+      var borrow: Borrow
+      subB(borrow, a[i], a[i], qm_lo, Borrow(0))
+      carry += SecretWord(borrow) # Adjust if borrow
+
+    if bool(a[i] != M[i]):
+      overM = bool(a[i] > M[i])
+
+  # Fix quotient, the true quotient is either q-1, q or q+1
+  #
+  # if carry < q or carry == q and overM we must do "a -= M"
+  # if carry > hi (negative result) we must do "a += M"
+  if bool(carry > hi):
+    var c = Carry(0)
+    for i in 0 ..< a.len:
+      addC(c, a[i], a[i], M[i], c)
+    q -= One
+  elif overM or bool(carry < hi):
+    var b = Borrow(0)
+    for i in 0 ..< a.len:
+      subB(b, a[i], a[i], M[i], b)
+    q += One
+
+  return q
+
+func shlAddMod_vartime(a: var openArray[SecretWord], c: SecretWord,
+                       M: openArray[SecretWord], mBits: int): SecretWord {.meter.} =
+  ## Fused modular left-shift + add
+  ## Computes: a <- a shl 2ʷ + c (mod M)
+  ## Returns: (a shl 2ʷ + c) / M
+  ##
+  ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms
+  ##
+  ## The modulus `M` most-significant bit at `mBits` MUST be set.
+  if mBits <= WordBitWidth:
+    # If M fits in a single limb
+
+    # We normalize M with clz so that the MSB is set
+    # And normalize (a * 2^64 + c) by R as well to maintain the result
+    # This ensures that (a0, a1)/m0 fits in a limb.
+    let R = mBits and (WordBitWidth - 1)
+
+    # (hi, lo) = a * 2^64 + c
+    if R == 0:
+      # We can delegate this R == 0 case to the
+      # shlAddMod_multiprec, with the same result.
+      # But isn't it faster to handle it here?
+      var q, r: SecretWord
+      div2n1n_vartime(q, r, a[0], c, M[0])
+      a[0] = r
+      return q
+    else:
+      let clz = WordBitWidth-R
+      let hi = (a[0] shl clz) or (c shr R)
+      let lo = c shl clz
+      let m0 = M[0] shl clz
+
+      var q, r: SecretWord
+      div2n1n(q, r, hi, lo, m0)
+      a[0] = r shr clz
+      return q
+  else:
+    return shlAddMod_multiprec_vartime(a, c, M, mBits)
+
+func divRem_vartime*(
+       q, r: var openArray[SecretWord],
+       a, b: openArray[SecretWord]): bool {.meter.} =
+  # q = a div b
+  # r = a mod b
+  #
+  # Requirements:
+  # b != 0
+  # q.len > a.len - b.len
+  # r.len >= b.len
+
+  let aBits = getBits_LE_vartime(a)
+  let bBits = getBits_LE_vartime(b)
+  let aLen = wordsRequired(aBits)
+  let bLen = wordsRequired(bBits)
+  let rLen = bLen
+
+  let aOffset = a.len - b.len
+
+  # Note: don't confuse a.len and aLen (actually used words)
+
+  if unlikely(bool(r.len < bLen)):
+    # remainder buffer cannot store up to modulus size
+    return false
+
+  if unlikely(bool(q.len < aOffset+1)):
+    # quotient buffer cannot store up to quotient size
+    return false
+
+  if unlikely(bBits == 0):
+    # Divide by zero
+    return false
+
+  if aBits < bBits:
+    # if a uses less bits than b,
+    # a < b, so q = 0 and r = a
+    copyWords(r.view(), 0, a.view(), 0, aLen)
+    for i in aLen ..< r.len:
+      r[i] = Zero
+    for i in 0 ..< q.len:
+      q[i] = Zero
+  else:
+    # The length of a is at least the divisor
+    # We can copy bLen-1 words
+    # and modular shift-left-add the rest
+
+    copyWords(r.view(), 0, a.view(), aOffset+1, b.len-1)
+    r[rLen-1] = Zero
+    # Now shift-left the copied words while adding the new word mod b
+
+    for i in countdown(aOffset, 0):
+      q[i] = shlAddMod_multiprec_vartime(
+              r.toOpenArray(0, rLen-1),
+              a[i],
+              b.toOpenArray(0, bLen-1),
+              bBits)
+
+    # Clean up extra words
+    for i in aOffset+1 ..< q.len:
+      q[i] = Zero
+    for i in rLen ..< r.len:
+      r[i] = Zero
+
+  return true
+
+func reduce_vartime*(r: var openArray[SecretWord],
+                     a, b: openArray[SecretWord]): bool {.noInline, meter.} =
+  let aBits = getBits_LE_vartime(a)
+  let bBits = getBits_LE_vartime(b)
+  let aLen = wordsRequired(aBits)
+  let bLen = wordsRequired(bBits)
+
+  let aOffset = a.len - b.len
+  var qBuf = allocHeapArray(SecretWord, aOffset+1)
+  template q: untyped = qBuf.toOpenArray(0, aOffset)
+  result = divRem_vartime(q, r, a, b)
+  freeHeap(qBuf)
+
+# ############################################################
+#
+#                    Barrett Reduction
+#
+# ############################################################
+
+# - https://en.wikipedia.org/wiki/Barrett_reduction
+# - Handbook of Applied Cryptography
+#   Alfred J. Menezes, Paul C. van Oorschot and Scott A. Vanstone
+#   https://cacr.uwaterloo.ca/hac/about/chap14.pdf
+# - Modern Computer Arithmetic
+#   Richard P. Brent and Paul Zimmermann
+#   https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf
diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim
index 8985468e..939d5419 100644
--- a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim
+++ b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim
@@ -12,7 +12,7 @@ import
   ./limbs_views,
   ./limbs_mod,
   ./limbs_fixedprec,
-  ./limbs_division
+  ./limbs_divmod_vartime
 
 # No exceptions allowed
 {.push raises: [], checks: off.}
@@ -72,13 +72,13 @@ func oneMont_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.
 
   # r.r_powmod_vartime(M, 1)
 
-  let mBits = getBits_LE_vartime(M)
-
   let t = allocStackArray(SecretWord, M.len + 1)
   zeroMem(t, M.len*sizeof(SecretWord))
   t[M.len] = One
 
+  let mBits = getBits_LE_vartime(M)
   r.view().reduce(LimbsViewMut t, M.len*WordBitWidth+1, M.view(), mBits)
+  # discard r.reduce_vartime(t.toOpenArray(0, M.len), M)
 
 func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter.} =
   ## Returns the Montgomery domain magic constant for the input modulus:
@@ -90,14 +90,13 @@ func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter
 
   # r.r_powmod_vartime(M, 2)
 
-  let mBits = getBits_LE_vartime(M)
-
   let t = allocStackArray(SecretWord, 2*M.len + 1)
   zeroMem(t, 2*M.len*sizeof(SecretWord))
   t[2*M.len] = One
 
-  r.view().reduce(LimbsViewMut t, 2*M.len*WordBitWidth+1, M.view(), mBits)
-
+  let mBits = getBits_LE_vartime(M)
+  r.view().reduce(LimbsViewMut t, 2*M.len*WordBitWidth+1, M, mBits)
+  # discard r.reduce_vartime(t.toOpenArray(0, 2*M.len), M)
 
 # Montgomery multiplication
 # ------------------------------------------
diff --git a/constantine/platforms/intrinsics/extended_precision_vartime.nim b/constantine/platforms/intrinsics/extended_precision_vartime.nim
new file mode 100644
index 00000000..7f6b8300
--- /dev/null
+++ b/constantine/platforms/intrinsics/extended_precision_vartime.nim
@@ -0,0 +1,158 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#              Variable-time extended-precision
+#                    compiler intrinsics
+#
+# ############################################################
+
+import ../abstractions
+
+func div2n1n_nim_vartime[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) {.tags:[VarTime].}=
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+
+  # doAssert leadingZeros(d) == 0, "Divisor was not normalized"
+
+  const
+    size = sizeof(q) * 8
+    halfSize = size div 2
+    halfMask = (1.T shl halfSize) - 1.T
+
+  func halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q, r: T] {.nimcall.} =
+
+    var (q, r) = (n_hi div d_hi, n_hi mod d_hi)
+    let m = q * d_lo
+    r = (r shl halfSize) or n_lo
+
+    # Fix the reminder, we're at most 2 iterations off
+    if r < m:
+      dec q
+      r += d
+      if r >= d and r < m:
+        dec q
+        r += d
+    r -= m
+    (q, r)
+
+  let
+    d_hi = d shr halfSize
+    d_lo = d and halfMask
+    n_lohi = n_lo shr halfSize
+    n_lolo = n_lo and halfMask
+
+  # First half of the quotient
+  let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo)
+
+  # Second half
+  let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo)
+
+  q = (q1 shl halfSize) or q2
+  r = r2
+
+
+when UseASM_X86_64:
+  func div2n1n_128_vartime(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline, tags:[VarTime].}=
+    ## Division uint128 by uint64
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+    ##   - if n_hi > d result is undefined
+
+    # DIV r/m64
+    # Divide RDX:RAX (n_hi:n_lo) by r/m64
+    #
+    # Inputs
+    #   - numerator high word in RDX,
+    #   - numerator low word in RAX,
+    #   - divisor as r/m parameter (register or memory at the compiler discretion)
+    # Result
+    #   - Quotient in RAX
+    #   - Remainder in RDX
+
+    # 1. name the register/memory "divisor"
+    # 2. don't forget to dereference the var hidden pointer
+    # 3. -
+    # 4. no clobbered registers beside explicitly used RAX and RDX
+    when defined(cpp):
+      asm """
+        div %[divisor]
+        :"=rax" (`q`), "=rdx" (`r`)
+        :"rdx" (`n_hi`), "rax" (`n_lo`), [divisor] "rm" (`d`)
+      """
+    else:
+      asm """
+        div %[divisor]
+        :"=rax" (*`q`), "=rdx" (*`r`)
+        :"rdx" (`n_hi`), "rax" (`n_lo`), [divisor] "rm" (`d`)
+      """
+
+elif sizeof(int) == 8 and defined(vcc):
+  func udiv128_vartime(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<intrin.h>", nodecl, tags:[VarTime].}
+    ## Division 128 by 64, Microsoft only, 64-bit only,
+    ## returns quotient as return value remainder as var parameter
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+    ##   - if n_hi > d result is undefined
+
+  func div2n1n_128_vartime(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+      ## Division uint128 by uint64
+      ## Warning ⚠️ :
+      ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+      ##   - if n_hi > d result is undefined
+      q = udiv128_vartime(n_hi, n_lo, d, r)
+
+elif sizeof(int) == 8 and GCC_Compatible:
+  type
+    uint128{.importc: "unsigned __int128".} = object
+
+  const
+    newerNim = (NimMajor, NimMinor) > (1, 6)
+    noExplicitPtrDeref = defined(cpp) or newerNim
+
+  func div2n1n_128_vartime(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline, tags:[VarTime].}=
+    ## Division uint128 by uint64
+    ## Warning ⚠️ :
+    ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms
+    ##   - if n_hi > d result is undefined
+    var dblPrec {.noinit.}: uint128
+    {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].}
+
+    # Don't forget to dereference the var param in C mode
+    when noExplicitPtrDeref:
+      {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].}
+      {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].}
+    else:
+      {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].}
+      {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].}
+
+
+func div2n1n_vartime*(q, r: var SecretWord, n_hi, n_lo, d: SecretWord) {.inline.} =
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+  ##
+  ## To avoid issues, n_hi, n_lo, d should be normalized.
+  ## i.e. shifted (== multiplied by the same power of 2)
+  ## so that the most significant bit in d is set.
+  when sizeof(int) == 4:
+    let dividend = (uint64(n_hi) shl 32) or uint64(n_lo)
+    let divisor = uint64(d)
+    q = uint32(dividend div divisor)
+    r = uint32(dividend mod divisor)
+  when nimvm:
+    div2n1n_nim_vartime(BaseType q, BaseType r, BaseType n_hi, BaseType n_lo, BaseType d)
+  else:
+    when declared(div2n1n_128_vartime):
+      div2n1n_128_vartime(BaseType q, BaseType r, BaseType n_hi, BaseType n_lo, BaseType d)
+    else:
+      div2n1n_nim_vartime(BaseType q, BaseType r, BaseType n_hi, BaseType n_lo, BaseType d)
diff --git a/tests/math_arbitrary_precision/t_bigints_mod.nim b/tests/math_arbitrary_precision/t_bigints_mod.nim
index a7ca91c8..b732dcce 100644
--- a/tests/math_arbitrary_precision/t_bigints_mod.nim
+++ b/tests/math_arbitrary_precision/t_bigints_mod.nim
@@ -3,14 +3,16 @@
 import
   ../../constantine/math/[
     arithmetic,
-    io/io_bigints]
+    io/io_bigints],
+  ../../constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime
 
 let a = BigInt[64].fromUint(0xa0e5cb56a1c08396'u64)
 let M = BigInt[64].fromUint(0xae57180eceb0206f'u64)
 
-var r: BigInt[64]
+var r, r2: BigInt[64]
 
 r.reduce(a, M)
+doAssert r2.limbs.reduce_vartime(a.limbs, M.limbs)
 
 let rU64 = 0xa0e5cb56a1c08396'u64 mod 0xae57180eceb0206f'u64
 echo r.toHex()
diff --git a/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim b/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim
index aada368a..ef41b7ef 100644
--- a/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim
+++ b/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim
@@ -15,6 +15,7 @@ import
   ../../constantine/math/[arithmetic, io/io_bigints],
   ../../constantine/platforms/primitives,
   ../../constantine/serialization/codecs,
+  ../../constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime,
   # Test utilities
   ../../helpers/prng_unsafe
 
@@ -126,8 +127,9 @@ proc main() =
     # Modulus
     mpz_mod(r, a, m)
 
-    var rTest: BigInt[mBits]
+    var rTest, rTest_vartime: BigInt[mBits]
     rTest.reduce(aTest, mTest)
+    doAssert rTest_vartime.limbs.reduce_vartime(aTest.limbs, mTest.limbs)
 
     #########################################################
     # Check
@@ -139,8 +141,9 @@ proc main() =
     var rGMP: array[mLen, byte]
     discard mpz_export(rGMP[0].addr, rW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r)
 
-    var rConstantine: array[mLen, byte]
+    var rConstantine, rCttVartime: array[mLen, byte]
     marshal(rConstantine, rTest, bigEndian)
+    marshal(rCttVartime, rTest_vartime, bigEndian)
 
     # echo "rGMP: ", rGMP.toHex()
     # echo "rConstantine: ", rConstantine.toHex()
@@ -158,4 +161,16 @@ proc main() =
       "  Constantine:    " & rConstantine.toHex() & "\n" &
       "(Note that GMP aligns bytes left while constantine aligns bytes right)"
 
+    doAssert rGMP.toOpenArray(0, rW-1) == rCttVartime.toOpenArray(mLen-rW, mLen-1), block:
+      # Reexport as bigEndian for debugging
+      discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a)
+      discard mpz_export(mBuf[0].addr, mW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, m)
+      "\nModulus with operands\n" &
+      "  a (" & align($aBits, 4) & "-bit):   " & aBuf.toHex & "\n" &
+      "  m (" & align($mBits, 4) & "-bit):   " & mBuf.toHex & "\n" &
+      "failed:" & "\n" &
+      "  GMP:            " & rGMP.toHex() & "\n" &
+      "  Constantine:    " & rCttVartime.toHex() & "\n" &
+      "(Note that GMP aligns bytes left while constantine aligns bytes right)"
+
 main()
diff --git a/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim b/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim
index d0465bc9..98c31aed 100644
--- a/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim
+++ b/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim
@@ -7,7 +7,7 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  ../../constantine/math_arbitrary_precision/arithmetic/[bigints_views, limbs_views, limbs_division],
+  ../../constantine/math_arbitrary_precision/arithmetic/[bigints_views, limbs_views, limbs_divmod],
   ../../constantine/platforms/abstractions,
   ../../helpers/prng_unsafe,