From 7b29fbdf5dfd1ed28ef3b2678b316a04dcb38153 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Fri, 8 Sep 2023 23:23:26 +0200 Subject: [PATCH] stash prep for Barret Reduction --- benchmarks/bench_gmp_modexp.nim | 108 ++++++++ benchmarks/bench_gmp_modmul.nim | 182 ++++++++++++++ constantine/math/arithmetic/bigints.nim | 2 +- .../arithmetic/bigints_views.nim | 6 +- .../{limbs_division.nim => limbs_divmod.nim} | 22 +- .../arithmetic/limbs_divmod_vartime.nim | 237 ++++++++++++++++++ .../arithmetic/limbs_montgomery.nim | 13 +- .../intrinsics/extended_precision_vartime.nim | 158 ++++++++++++ .../t_bigints_mod.nim | 6 +- .../t_bigints_mod_vs_gmp.nim | 19 +- .../t_bigints_powmod_vs_gmp.nim | 2 +- 11 files changed, 729 insertions(+), 26 deletions(-) create mode 100644 benchmarks/bench_gmp_modexp.nim create mode 100644 benchmarks/bench_gmp_modmul.nim rename constantine/math_arbitrary_precision/arithmetic/{limbs_division.nim => limbs_divmod.nim} (92%) create mode 100644 constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim create mode 100644 constantine/platforms/intrinsics/extended_precision_vartime.nim diff --git a/benchmarks/bench_gmp_modexp.nim b/benchmarks/bench_gmp_modexp.nim new file mode 100644 index 00000000..341f81a6 --- /dev/null +++ b/benchmarks/bench_gmp_modexp.nim @@ -0,0 +1,108 @@ +import + ../constantine/math/arithmetic, + ../constantine/math/io/[io_bigints, io_fields], + ../constantine/math_arbitrary_precision/arithmetic/[bigints_views, limbs_views, limbs_montgomery, limbs_mod2k], + ../constantine/math/config/[type_bigint, curves, precompute], + ../constantine/platforms/abstractions, + ../constantine/serialization/codecs, + ../helpers/prng_unsafe, + std/[times, monotimes, strformat] + +import gmp +# import stint + +const # https://gmplib.org/manual/Integer-Import-and-Export.html + GMP_WordLittleEndian = -1'i32 + GMP_WordNativeEndian = 0'i32 + GMP_WordBigEndian = 1'i32 + + GMP_MostSignificantWordFirst = 1'i32 + GMP_LeastSignificantWordFirst = -1'i32 + +# let M = Mod(BN254_Snarks) +const bits = 256 +const expBits = bits # Stint only supports same size args + +var rng: RngState +rng.seed(1234) + +for i in 0 ..< 5: + echo "i: ", i + # ------------------------- + let M = rng.random_long01Seq(BigInt[bits]) + let a = rng.random_long01Seq(BigInt[bits]) + + var exponent = newSeq[byte](expBits div 8) + for i in 0 ..< expBits div 8: + exponent[i] = byte rng.next() + + # ------------------------- + + let aHex = a.toHex() + let eHex = exponent.toHex() + let mHex = M.toHex() + + echo " base: ", a.toHex() + echo " exponent: ", exponent.toHex() + echo " modulus: ", M.toHex() + + # ------------------------- + + var elapsedCtt, elapsedStint, elapsedGMP: int64 + + block: + var r: BigInt[bits] + let start = getMonotime() + r.limbs.powMod_vartime(a.limbs, exponent, M.limbs, window = 4) + let stop = getMonotime() + + elapsedCtt = inNanoseconds(stop-start) + + echo " r Constantine: ", r.toHex() + echo " elapsed Constantine: ", elapsedCtt, " ns" + + # ------------------------- + + # block: + # let aa = Stuint[bits].fromHex(aHex) + # let ee = Stuint[expBits].fromHex(eHex) + # let mm = Stuint[bits].fromHex(mHex) + + # var r: Stuint[bits] + # let start = getMonotime() + # r = powmod(aa, ee, mm) + # let stop = getMonotime() + + # elapsedStint = inNanoseconds(stop-start) + + # echo " r stint: ", r.toHex() + # echo " elapsed Stint: ", elapsedStint, " ns" + + block: + var aa, ee, mm, rr: mpz_t + mpz_init(aa) + mpz_init(ee) + mpz_init(mm) + mpz_init(rr) + + aa.mpz_import(a.limbs.len, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, a.limbs[0].unsafeAddr) + let e = BigInt[expBits].unmarshal(exponent, bigEndian) + ee.mpz_import(e.limbs.len, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, e.limbs[0].unsafeAddr) + mm.mpz_import(M.limbs.len, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, M.limbs[0].unsafeAddr) + + let start = getMonotime() + rr.mpz_powm(aa, ee, mm) + let stop = getMonotime() + + elapsedGMP = inNanoSeconds(stop-start) + + var r: BigInt[bits] + var rWritten: csize + discard r.limbs[0].addr.mpz_export(rWritten.addr, GMP_LeastSignificantWordFirst, sizeof(SecretWord), GMP_WordNativeEndian, 0, rr) + + echo " r GMP: ", r.toHex() + echo " elapsed GMP: ", elapsedGMP, " ns" + + # echo &"\n ratio Stint/Constantine: {float64(elapsedStint)/float64(elapsedCtt):.3f}x" + echo &" ratio GMP/Constantine: {float64(elapsedGMP)/float64(elapsedCtt):.3f}x" + echo "---------------------------------------------------------" \ No newline at end of file diff --git a/benchmarks/bench_gmp_modmul.nim b/benchmarks/bench_gmp_modmul.nim new file mode 100644 index 00000000..5c0e8f29 --- /dev/null +++ b/benchmarks/bench_gmp_modmul.nim @@ -0,0 +1,182 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/[macros, times, strutils, monotimes], + # Third-party + gmp, + # Internal + ../constantine/math/io/io_bigints, + ../constantine/math/arithmetic, + ../constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime, + ../constantine/platforms/abstractions, + ../constantine/serialization/codecs, + # Test utilities + ../helpers/prng_unsafe + +echo "\n------------------------------------------------------\n" +# We test up to 1024-bit, more is really slow + +macro testSizes(rBits, aBits, bBits, body: untyped): untyped = + ## Configure sizes known at compile-time to test against GMP + result = newStmtList() + + for size in [256, 384, 121*8]: + let aBitsVal = size + let bBitsVal = size + let rBitsVal = size * 2 + + result.add quote do: + block: + const `aBits` = `aBitsVal` + const `bBits` = `bBitsVal` + const `rBits` = `rBitsVal` + block: + `body` + +const # https://gmplib.org/manual/Integer-Import-and-Export.html + GMP_WordLittleEndian {.used.} = -1'i32 + GMP_WordNativeEndian {.used.} = 0'i32 + GMP_WordBigEndian {.used.} = 1'i32 + + GMP_MostSignificantWordFirst = 1'i32 + GMP_LeastSignificantWordFirst {.used.} = -1'i32 + +proc main() = + var rng: RngState + let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(seed) + echo "\n------------------------------------------------------\n" + echo "rng xoshiro512** seed: ", seed + echo "" + + var r, rMod, a, b: mpz_t + mpz_init(r) + mpz_init(rMod) + mpz_init(a) + mpz_init(b) + + testSizes(rBits, aBits, bBits): + # echo "--------------------------------------------------------------------------------" + echo "Testing: mul r (", align($rBits, 4), "-bit) <- a (", align($aBits, 4), "-bit) * b (", align($bBits, 4), "-bit)" + + # Build the bigints + let aTest = rng.random_unsafe(BigInt[aBits]) + var bTest = rng.random_unsafe(BigInt[bBits]) + + ######################################################### + # Conversion to GMP + const aLen = (aBits + 7) div 8 + const bLen = (bBits + 7) div 8 + + var aBuf: array[aLen, byte] + var bBuf: array[bLen, byte] + + aBuf.marshal(aTest, bigEndian) + bBuf.marshal(bTest, bigEndian) + + mpz_import(a, aLen, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, aBuf[0].addr) + mpz_import(b, bLen, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, bBuf[0].addr) + + ######################################################### + # Multiplication + const NumIters = 1000000 + + let startGMP = getMonoTime() + for _ in 0 ..< NumIters: + mpz_mul(r, a, b) + let stopGMP = getMonoTime() + echo "GMP - ", aBits, " x ", bBits, " -> ", rBits, " mul: ", float(inNanoseconds((stopGMP-startGMP)))/float(NumIters), " ns" + + # If a*b overflow the result size we truncate + const numWords = wordsRequired(rBits) + when numWords < wordsRequired(aBits+bBits): + echo " truncating from ", wordsRequired(aBits+bBits), " words to ", numWords, " (2^", WordBitwidth * numWords, ")" + r.mpz_tdiv_r_2exp(r, WordBitwidth * numWords) + + let startGMPmod = getMonoTime() + for _ in 0 ..< NumIters: + mpz_mod(rMod, a, b) + let stopGMPmod = getMonoTime() + echo "GMP - ", aBits, " mod ", bBits, " -> ", bBits, " mod: ", float(inNanoseconds((stopGMPmod-startGMPmod)))/float(NumIters), " ns" + + let startGMPmod2 = getMonoTime() + for _ in 0 ..< NumIters: + mpz_mod(rMod, r, b) + let stopGMPmod2 = getMonoTime() + echo "GMP - ", rBits, " mod ", bBits, " -> ", bBits, " mod: ", float(inNanoseconds((stopGMPmod2-startGMPmod2)))/float(NumIters), " ns" + + + # Constantine + var rTest: BigInt[rBits] + + let startCTT = getMonoTime() + for _ in 0 ..< NumIters: + rTest.prod(aTest, bTest) + let stopCTT = getMonoTime() + echo "Constantine - ", aBits, " x ", bBits, " -> ", rBits, " mul: ", float(inNanoseconds((stopCTT-startCTT)))/float(NumIters), " ns" + + var rTestMod: BigInt[bBits] + + let startCTTMod = getMonoTime() + for _ in 0 ..< NumIters: + rTestMod.reduce(aTest, bTest) + let stopCTTMod = getMonoTime() + echo "Constantine - ", aBits, " mod ", bBits, " -> ", bBits, " mod: ", float(inNanoseconds((stopCTTmod-startCTTmod)))/float(NumIters), " ns" + + let startCTTvartimeMod = getMonoTime() + var q {.noInit.}: BigInt[bBits] + for _ in 0 ..< NumIters: + discard divRem_vartime(q.limbs, rTestMod.limbs, aTest.limbs, bTest.limbs) + let stopCTTvartimeMod = getMonoTime() + echo "Constantine - ", aBits, " mod ", bBits, " (vartime) -> ", bBits, " mod: ", float(inNanoseconds((stopCTTvartimeMod-startCTTvartimeMod)))/float(NumIters), " ns" + + let startCTTMod2 = getMonoTime() + for _ in 0 ..< NumIters: + rTestMod.reduce(rTest, bTest) + let stopCTTMod2 = getMonoTime() + echo "Constantine - ", rBits, " mod ", bBits, " -> ", bBits, " mod: ", float(inNanoseconds((stopCTTmod2-startCTTmod2)))/float(NumIters), " ns" + + let startCTTvartimeMod2 = getMonoTime() + var q2 {.noInit.}: BigInt[bBits] + for _ in 0 ..< NumIters: + discard divRem_vartime(q2.limbs, rTestMod.limbs, rTest.limbs, bTest.limbs) + let stopCTTvartimeMod2 = getMonoTime() + echo "Constantine - ", rBits, " mod ", bBits, " (vartime) -> ", bBits, " mod: ", float(inNanoseconds((stopCTTvartimeMod2-startCTTvartimeMod2)))/float(NumIters), " ns" + + echo "" + + ######################################################### + # Check + + {.push warnings: off.} # deprecated csize + var aW, bW, rW: csize # Word written by GMP + {.pop.} + + const rLen = numWords * WordBitWidth + var rGMP: array[rLen, byte] + discard mpz_export(rGMP[0].addr, rW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r) + + var rConstantine: array[rLen, byte] + marshal(rConstantine, rTest, bigEndian) + + # Note: in bigEndian, GMP aligns left while constantine aligns right + doAssert rGMP.toOpenArray(0, rW-1) == rConstantine.toOpenArray(rLen-rW, rLen-1), block: + # Reexport as bigEndian for debugging + discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a) + discard mpz_export(bBuf[0].addr, bW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, b) + "\nMultiplication with operands\n" & + " a (" & align($aBits, 4) & "-bit): " & aBuf.toHex & "\n" & + " b (" & align($bBits, 4) & "-bit): " & bBuf.toHex & "\n" & + "into r of size " & align($rBits, 4) & "-bit failed:" & "\n" & + " GMP: " & rGMP.toHex() & "\n" & + " Constantine: " & rConstantine.toHex() & "\n" & + "(Note that GMP aligns bytes left while constantine aligns bytes right)" + +main() diff --git a/constantine/math/arithmetic/bigints.nim b/constantine/math/arithmetic/bigints.nim index 1dfe244c..a5df367c 100644 --- a/constantine/math/arithmetic/bigints.nim +++ b/constantine/math/arithmetic/bigints.nim @@ -12,7 +12,7 @@ import ./limbs, ./limbs_extmul, ./limbs_exgcd, - ../../math_arbitrary_precision/arithmetic/limbs_division + ../../math_arbitrary_precision/arithmetic/limbs_divmod export BigInt diff --git a/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim b/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim index 2c0f9384..ffff1062 100644 --- a/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim +++ b/constantine/math_arbitrary_precision/arithmetic/bigints_views.nim @@ -15,7 +15,7 @@ import ./limbs_mod2k, ./limbs_multiprec, ./limbs_extmul, - ./limbs_division + ./limbs_divmod_vartime # No exceptions allowed {.push raises: [], checks: off.} @@ -61,6 +61,7 @@ func powOddMod_vartime*( if eBits == 1: r.view().reduce(a.view(), aBits, M.view(), mBits) + # discard r.reduce_vartime(a, M) return let L = wordsRequired(mBits) @@ -77,8 +78,9 @@ func powOddMod_vartime*( # For now, we call explicit reduction as it can handle all sizes. # TODO: explicit reduction uses constant-time division which is **very** expensive if a.len != M.len: - let t = allocStackArray(SecretWord, L) + var t = allocStackArray(SecretWord, L) t.LimbsViewMut.reduce(a.view(), aBits, M.view(), mBits) + # discard t.toOpenArray(0, L-1).reduce_vartime(a, M) rMont.LimbsViewMut.getMont(LimbsViewConst t, M.view(), LimbsViewConst r2.view(), m0ninv, mBits) else: rMont.LimbsViewMut.getMont(a.view(), M.view(), LimbsViewConst r2.view(), m0ninv, mBits) diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod.nim similarity index 92% rename from constantine/math_arbitrary_precision/arithmetic/limbs_division.nim rename to constantine/math_arbitrary_precision/arithmetic/limbs_divmod.nim index aab1d902..03715c71 100644 --- a/constantine/math_arbitrary_precision/arithmetic/limbs_division.nim +++ b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod.nim @@ -63,7 +63,7 @@ func shlAddMod_estimate(a: LimbsViewMut, aLen: int, a1 = (a[^1] shl (WordBitWidth-R)) or (a[^2] shr R) m0 = (M[^1] shl (WordBitWidth-R)) or (M[^2] shr R) - # m0 has its high bit set. (a0, a1)/p0 fits in a limb. + # m0 has its high bit set. (a0, a1)/m0 fits in a limb. # Get a quotient q, at most we will be 2 iterations off # from the true quotient var q, r: SecretWord @@ -78,29 +78,29 @@ func shlAddMod_estimate(a: LimbsViewMut, aLen: int, # Now substract a*2^64 - q*p var carry = Zero - var over_p = CtTrue # Track if quotient greater than the modulus + var overM = CtTrue # Track if quotient greater than the modulus for i in 0 ..< MLen: - var qp_lo: SecretWord + var qm_lo: SecretWord - block: # q*p - # q * p + carry (doubleword) carry from previous limb - muladd1(carry, qp_lo, q, M[i], carry) + block: # q*m + # q * m + carry (doubleword) carry from previous limb + muladd1(carry, qm_lo, q, M[i], carry) block: # a*2^64 - q*p var borrow: Borrow - subB(borrow, a[i], a[i], qp_lo, Borrow(0)) + subB(borrow, a[i], a[i], qm_lo, Borrow(0)) carry += SecretWord(borrow) # Adjust if borrow - over_p = mux(a[i] == M[i], over_p, a[i] > M[i]) + overM = mux(a[i] == M[i], overM, a[i] > M[i]) # Fix quotient, the true quotient is either q-1, q or q+1 # - # if carry < q or carry == q and over_p we must do "a -= p" - # if carry > hi (negative result) we must do "a += p" + # if carry < q or carry == q and over_p we must do "a -= m" + # if carry > hi (negative result) we must do "a += m" result.neg = carry > hi - result.tooBig = not(result.neg) and (over_p or (carry < hi)) + result.tooBig = not(result.neg) and (overM or (carry < hi)) func shlAddMod(a: LimbsViewMut, aLen: int, c: SecretWord, M: LimbsViewConst, mBits: int) = diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim new file mode 100644 index 00000000..36d22e6f --- /dev/null +++ b/constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime.nim @@ -0,0 +1,237 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + ../../platforms/abstractions, + ../../platforms/intrinsics/extended_precision_vartime, + ./limbs_views, + ./limbs_fixedprec + +# No exceptions allowed +{.push raises: [].} + +# ############################################################ +# +# Division and Modular Reduction +# (variable-time) +# +# ############################################################ + +func shlAddMod_multiprec_vartime( + a: var openArray[SecretWord], c: SecretWord, + M: openArray[SecretWord], mBits: int): SecretWord {.meter.} = + ## Fused modular left-shift + add + ## Computes: a <- a shl 2ʷ + c (mod M) + ## Returns: (a shl 2ʷ + c) / M + ## + ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms + ## + ## The modulus `M` most-significant bit at `mBits` MUST be set. + ## + ## Specialized for M being a multi-precision integer. + # Assuming 64-bit words + let hi = a[^1] # Save the high word to detect carries + let R = mBits and (WordBitWidth - 1) # R = mBits mod 64 + + var a0, a1, m0: SecretWord + if R == 0: # If the number of mBits is a multiple of 64 + a0 = a[^1] # + copyWords(a.view(), 1, # we can just shift words + a.view(), 0, a.len-1) # + a[0] = c # and replace the first one by c + a1 = a[^1] + m0 = M[^1] + else: # Else: need to deal with partial word shifts at the edge. + let clz = WordBitWidth-R + a0 = (a[^1] shl clz) or (a[^2] shr R) + copyWords(a.view(), 1, + a.view(), 0, a.len-1) + a[0] = c + a1 = (a[^1] shl clz) or (a[^2] shr R) + m0 = (M[^1] shl clz) or (M[^2] shr R) + + # m0 has its high bit set. (a0, a1)/m0 fits in a limb. + # Get a quotient q, at most we will be 2 iterations off + # from the true quotient + var q: SecretWord # Estimate quotient + if bool(a0 == m0): # if a_hi == divisor + q = MaxWord # quotient = MaxWord (0b1111...1111) + elif bool(a0.isZero()) and + bool(a1 < m0): # elif q == 0, true quotient = 0 + q = Zero + return q + else: + var r: SecretWord + div2n1n(q, r, a0, a1, m0) # else instead of being of by 0, 1 or 2 + q -= One # we return q-1 to be off by -1, 0 or 1 + + # Now substract a*2^64 - q*m + var carry = Zero + var overM = true # Track if quotient greater than the modulus + + for i in 0 ..< M.len: + var qm_lo: SecretWord + block: # q*m + # q * m + carry (doubleword) carry from previous limb + muladd1(carry, qm_lo, q, M[i], carry) + + block: # a*2^64 - q*m + var borrow: Borrow + subB(borrow, a[i], a[i], qm_lo, Borrow(0)) + carry += SecretWord(borrow) # Adjust if borrow + + if bool(a[i] != M[i]): + overM = bool(a[i] > M[i]) + + # Fix quotient, the true quotient is either q-1, q or q+1 + # + # if carry < q or carry == q and overM we must do "a -= M" + # if carry > hi (negative result) we must do "a += M" + if bool(carry > hi): + var c = Carry(0) + for i in 0 ..< a.len: + addC(c, a[i], a[i], M[i], c) + q -= One + elif overM or bool(carry < hi): + var b = Borrow(0) + for i in 0 ..< a.len: + subB(b, a[i], a[i], M[i], b) + q += One + + return q + +func shlAddMod_vartime(a: var openArray[SecretWord], c: SecretWord, + M: openArray[SecretWord], mBits: int): SecretWord {.meter.} = + ## Fused modular left-shift + add + ## Computes: a <- a shl 2ʷ + c (mod M) + ## Returns: (a shl 2ʷ + c) / M + ## + ## with w the base word width, usually 32 on 32-bit platforms and 64 on 64-bit platforms + ## + ## The modulus `M` most-significant bit at `mBits` MUST be set. + if mBits <= WordBitWidth: + # If M fits in a single limb + + # We normalize M with clz so that the MSB is set + # And normalize (a * 2^64 + c) by R as well to maintain the result + # This ensures that (a0, a1)/m0 fits in a limb. + let R = mBits and (WordBitWidth - 1) + + # (hi, lo) = a * 2^64 + c + if R == 0: + # We can delegate this R == 0 case to the + # shlAddMod_multiprec, with the same result. + # But isn't it faster to handle it here? + var q, r: SecretWord + div2n1n_vartime(q, r, a[0], c, M[0]) + a[0] = r + return q + else: + let clz = WordBitWidth-R + let hi = (a[0] shl clz) or (c shr R) + let lo = c shl clz + let m0 = M[0] shl clz + + var q, r: SecretWord + div2n1n(q, r, hi, lo, m0) + a[0] = r shr clz + return q + else: + return shlAddMod_multiprec_vartime(a, c, M, mBits) + +func divRem_vartime*( + q, r: var openArray[SecretWord], + a, b: openArray[SecretWord]): bool {.meter.} = + # q = a div b + # r = a mod b + # + # Requirements: + # b != 0 + # q.len > a.len - b.len + # r.len >= b.len + + let aBits = getBits_LE_vartime(a) + let bBits = getBits_LE_vartime(b) + let aLen = wordsRequired(aBits) + let bLen = wordsRequired(bBits) + let rLen = bLen + + let aOffset = a.len - b.len + + # Note: don't confuse a.len and aLen (actually used words) + + if unlikely(bool(r.len < bLen)): + # remainder buffer cannot store up to modulus size + return false + + if unlikely(bool(q.len < aOffset+1)): + # quotient buffer cannot store up to quotient size + return false + + if unlikely(bBits == 0): + # Divide by zero + return false + + if aBits < bBits: + # if a uses less bits than b, + # a < b, so q = 0 and r = a + copyWords(r.view(), 0, a.view(), 0, aLen) + for i in aLen ..< r.len: + r[i] = Zero + for i in 0 ..< q.len: + q[i] = Zero + else: + # The length of a is at least the divisor + # We can copy bLen-1 words + # and modular shift-left-add the rest + + copyWords(r.view(), 0, a.view(), aOffset+1, b.len-1) + r[rLen-1] = Zero + # Now shift-left the copied words while adding the new word mod b + + for i in countdown(aOffset, 0): + q[i] = shlAddMod_multiprec_vartime( + r.toOpenArray(0, rLen-1), + a[i], + b.toOpenArray(0, bLen-1), + bBits) + + # Clean up extra words + for i in aOffset+1 ..< q.len: + q[i] = Zero + for i in rLen ..< r.len: + r[i] = Zero + + return true + +func reduce_vartime*(r: var openArray[SecretWord], + a, b: openArray[SecretWord]): bool {.noInline, meter.} = + let aBits = getBits_LE_vartime(a) + let bBits = getBits_LE_vartime(b) + let aLen = wordsRequired(aBits) + let bLen = wordsRequired(bBits) + + let aOffset = a.len - b.len + var qBuf = allocHeapArray(SecretWord, aOffset+1) + template q: untyped = qBuf.toOpenArray(0, aOffset) + result = divRem_vartime(q, r, a, b) + freeHeap(qBuf) + +# ############################################################ +# +# Barrett Reduction +# +# ############################################################ + +# - https://en.wikipedia.org/wiki/Barrett_reduction +# - Handbook of Applied Cryptography +# Alfred J. Menezes, Paul C. van Oorschot and Scott A. Vanstone +# https://cacr.uwaterloo.ca/hac/about/chap14.pdf +# - Modern Computer Arithmetic +# Richard P. Brent and Paul Zimmermann +# https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf diff --git a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim index 8985468e..939d5419 100644 --- a/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim +++ b/constantine/math_arbitrary_precision/arithmetic/limbs_montgomery.nim @@ -12,7 +12,7 @@ import ./limbs_views, ./limbs_mod, ./limbs_fixedprec, - ./limbs_division + ./limbs_divmod_vartime # No exceptions allowed {.push raises: [], checks: off.} @@ -72,13 +72,13 @@ func oneMont_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {. # r.r_powmod_vartime(M, 1) - let mBits = getBits_LE_vartime(M) - let t = allocStackArray(SecretWord, M.len + 1) zeroMem(t, M.len*sizeof(SecretWord)) t[M.len] = One + let mBits = getBits_LE_vartime(M) r.view().reduce(LimbsViewMut t, M.len*WordBitWidth+1, M.view(), mBits) + # discard r.reduce_vartime(t.toOpenArray(0, M.len), M) func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter.} = ## Returns the Montgomery domain magic constant for the input modulus: @@ -90,14 +90,13 @@ func r2_vartime*(r: var openArray[SecretWord], M: openArray[SecretWord]) {.meter # r.r_powmod_vartime(M, 2) - let mBits = getBits_LE_vartime(M) - let t = allocStackArray(SecretWord, 2*M.len + 1) zeroMem(t, 2*M.len*sizeof(SecretWord)) t[2*M.len] = One - r.view().reduce(LimbsViewMut t, 2*M.len*WordBitWidth+1, M.view(), mBits) - + let mBits = getBits_LE_vartime(M) + r.view().reduce(LimbsViewMut t, 2*M.len*WordBitWidth+1, M, mBits) + # discard r.reduce_vartime(t.toOpenArray(0, 2*M.len), M) # Montgomery multiplication # ------------------------------------------ diff --git a/constantine/platforms/intrinsics/extended_precision_vartime.nim b/constantine/platforms/intrinsics/extended_precision_vartime.nim new file mode 100644 index 00000000..7f6b8300 --- /dev/null +++ b/constantine/platforms/intrinsics/extended_precision_vartime.nim @@ -0,0 +1,158 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# ############################################################ +# +# Variable-time extended-precision +# compiler intrinsics +# +# ############################################################ + +import ../abstractions + +func div2n1n_nim_vartime[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) {.tags:[VarTime].}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + # doAssert leadingZeros(d) == 0, "Divisor was not normalized" + + const + size = sizeof(q) * 8 + halfSize = size div 2 + halfMask = (1.T shl halfSize) - 1.T + + func halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q, r: T] {.nimcall.} = + + var (q, r) = (n_hi div d_hi, n_hi mod d_hi) + let m = q * d_lo + r = (r shl halfSize) or n_lo + + # Fix the reminder, we're at most 2 iterations off + if r < m: + dec q + r += d + if r >= d and r < m: + dec q + r += d + r -= m + (q, r) + + let + d_hi = d shr halfSize + d_lo = d and halfMask + n_lohi = n_lo shr halfSize + n_lolo = n_lo and halfMask + + # First half of the quotient + let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo) + + # Second half + let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo) + + q = (q1 shl halfSize) or q2 + r = r2 + + +when UseASM_X86_64: + func div2n1n_128_vartime(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline, tags:[VarTime].}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + # DIV r/m64 + # Divide RDX:RAX (n_hi:n_lo) by r/m64 + # + # Inputs + # - numerator high word in RDX, + # - numerator low word in RAX, + # - divisor as r/m parameter (register or memory at the compiler discretion) + # Result + # - Quotient in RAX + # - Remainder in RDX + + # 1. name the register/memory "divisor" + # 2. don't forget to dereference the var hidden pointer + # 3. - + # 4. no clobbered registers beside explicitly used RAX and RDX + when defined(cpp): + asm """ + div %[divisor] + :"=rax" (`q`), "=rdx" (`r`) + :"rdx" (`n_hi`), "rax" (`n_lo`), [divisor] "rm" (`d`) + """ + else: + asm """ + div %[divisor] + :"=rax" (*`q`), "=rdx" (*`r`) + :"rdx" (`n_hi`), "rax" (`n_lo`), [divisor] "rm" (`d`) + """ + +elif sizeof(int) == 8 and defined(vcc): + func udiv128_vartime(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "", nodecl, tags:[VarTime].} + ## Division 128 by 64, Microsoft only, 64-bit only, + ## returns quotient as return value remainder as var parameter + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + func div2n1n_128_vartime(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + q = udiv128_vartime(n_hi, n_lo, d, r) + +elif sizeof(int) == 8 and GCC_Compatible: + type + uint128{.importc: "unsigned __int128".} = object + + const + newerNim = (NimMajor, NimMinor) > (1, 6) + noExplicitPtrDeref = defined(cpp) or newerNim + + func div2n1n_128_vartime(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline, tags:[VarTime].}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms + ## - if n_hi > d result is undefined + var dblPrec {.noinit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].} + + # Don't forget to dereference the var param in C mode + when noExplicitPtrDeref: + {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].} + else: + {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} + + +func div2n1n_vartime*(q, r: var SecretWord, n_hi, n_lo, d: SecretWord) {.inline.} = + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + ## + ## To avoid issues, n_hi, n_lo, d should be normalized. + ## i.e. shifted (== multiplied by the same power of 2) + ## so that the most significant bit in d is set. + when sizeof(int) == 4: + let dividend = (uint64(n_hi) shl 32) or uint64(n_lo) + let divisor = uint64(d) + q = uint32(dividend div divisor) + r = uint32(dividend mod divisor) + when nimvm: + div2n1n_nim_vartime(BaseType q, BaseType r, BaseType n_hi, BaseType n_lo, BaseType d) + else: + when declared(div2n1n_128_vartime): + div2n1n_128_vartime(BaseType q, BaseType r, BaseType n_hi, BaseType n_lo, BaseType d) + else: + div2n1n_nim_vartime(BaseType q, BaseType r, BaseType n_hi, BaseType n_lo, BaseType d) diff --git a/tests/math_arbitrary_precision/t_bigints_mod.nim b/tests/math_arbitrary_precision/t_bigints_mod.nim index a7ca91c8..b732dcce 100644 --- a/tests/math_arbitrary_precision/t_bigints_mod.nim +++ b/tests/math_arbitrary_precision/t_bigints_mod.nim @@ -3,14 +3,16 @@ import ../../constantine/math/[ arithmetic, - io/io_bigints] + io/io_bigints], + ../../constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime let a = BigInt[64].fromUint(0xa0e5cb56a1c08396'u64) let M = BigInt[64].fromUint(0xae57180eceb0206f'u64) -var r: BigInt[64] +var r, r2: BigInt[64] r.reduce(a, M) +doAssert r2.limbs.reduce_vartime(a.limbs, M.limbs) let rU64 = 0xa0e5cb56a1c08396'u64 mod 0xae57180eceb0206f'u64 echo r.toHex() diff --git a/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim b/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim index aada368a..ef41b7ef 100644 --- a/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim +++ b/tests/math_arbitrary_precision/t_bigints_mod_vs_gmp.nim @@ -15,6 +15,7 @@ import ../../constantine/math/[arithmetic, io/io_bigints], ../../constantine/platforms/primitives, ../../constantine/serialization/codecs, + ../../constantine/math_arbitrary_precision/arithmetic/limbs_divmod_vartime, # Test utilities ../../helpers/prng_unsafe @@ -126,8 +127,9 @@ proc main() = # Modulus mpz_mod(r, a, m) - var rTest: BigInt[mBits] + var rTest, rTest_vartime: BigInt[mBits] rTest.reduce(aTest, mTest) + doAssert rTest_vartime.limbs.reduce_vartime(aTest.limbs, mTest.limbs) ######################################################### # Check @@ -139,8 +141,9 @@ proc main() = var rGMP: array[mLen, byte] discard mpz_export(rGMP[0].addr, rW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, r) - var rConstantine: array[mLen, byte] + var rConstantine, rCttVartime: array[mLen, byte] marshal(rConstantine, rTest, bigEndian) + marshal(rCttVartime, rTest_vartime, bigEndian) # echo "rGMP: ", rGMP.toHex() # echo "rConstantine: ", rConstantine.toHex() @@ -158,4 +161,16 @@ proc main() = " Constantine: " & rConstantine.toHex() & "\n" & "(Note that GMP aligns bytes left while constantine aligns bytes right)" + doAssert rGMP.toOpenArray(0, rW-1) == rCttVartime.toOpenArray(mLen-rW, mLen-1), block: + # Reexport as bigEndian for debugging + discard mpz_export(aBuf[0].addr, aW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, a) + discard mpz_export(mBuf[0].addr, mW.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, m) + "\nModulus with operands\n" & + " a (" & align($aBits, 4) & "-bit): " & aBuf.toHex & "\n" & + " m (" & align($mBits, 4) & "-bit): " & mBuf.toHex & "\n" & + "failed:" & "\n" & + " GMP: " & rGMP.toHex() & "\n" & + " Constantine: " & rCttVartime.toHex() & "\n" & + "(Note that GMP aligns bytes left while constantine aligns bytes right)" + main() diff --git a/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim b/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim index d0465bc9..98c31aed 100644 --- a/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim +++ b/tests/math_arbitrary_precision/t_bigints_powmod_vs_gmp.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ../../constantine/math_arbitrary_precision/arithmetic/[bigints_views, limbs_views, limbs_division], + ../../constantine/math_arbitrary_precision/arithmetic/[bigints_views, limbs_views, limbs_divmod], ../../constantine/platforms/abstractions, ../../helpers/prng_unsafe,