diff --git a/benchmarks/bench_gt_prj.nim b/benchmarks/bench_gt_prj.nim
index b802e00e..6f2253ca 100644
--- a/benchmarks/bench_gt_prj.nim
+++ b/benchmarks/bench_gt_prj.nim
@@ -17,6 +17,7 @@ import
   ./bench_blueprint
 
 const Iters = 100_000
+const BatchIters = 1_000
 const AvailableCurves = [
   BLS12_381,
   # BN254_Snarks
@@ -125,6 +126,53 @@ proc gtFromTorus(C: static Algebra, iters: int) =
   bench("𝔾ₜ <- T₂(𝔽p6) conversion", Quad[Fp6[C]], iters):
     r.fromTorus2_vartime(t)
 
+proc torusFromGtMultiNaive(C: static Algebra, batchSize, iters: int) =
+  ## Times `batchSize` conversions 𝔾ₜ -> T₂(𝔽p6), one `fromGT_vartime` call
+  ## per element. Inputs are drawn with `rng.random_gt` before timing.
+  var r = newSeq[T2Aff[Fp6[C]]](batchSize)
+  var xx = newSeq[Quad[Fp6[C]]](batchSize)
+  for x in xx.mitems():
+    x = rng.random_gt(Quad[Fp6[C]])
+  bench("T₂(𝔽p6) <- 𝔾ₜ multi-conversion naive - " & $batchSize, Quad[Fp6[C]], iters):
+    for i in 0 ..< batchSize:
+      r[i].fromGT_vartime(xx[i])
+
+proc torusFromGtMultiBatch(C: static Algebra, batchSize, iters: int) =
+  ## Times `batchSize` conversions 𝔾ₜ -> T₂(𝔽p6) done by a single
+  ## `batchFromGT_vartime` call. Inputs are drawn with `rng.random_gt` before timing.
+  var r = newSeq[T2Aff[Fp6[C]]](batchSize)
+  var xx = newSeq[Quad[Fp6[C]]](batchSize)
+  for x in xx.mitems():
+    x = rng.random_gt(Quad[Fp6[C]])
+  bench("T₂(𝔽p6) <- 𝔾ₜ multi-conversion batched - " & $batchSize, Quad[Fp6[C]], iters):
+    r.batchFromGT_vartime(xx)
+
+proc gtFromTorus2MultiNaive(C: static Algebra, batchSize, iters: int) =
+  ## Times `batchSize` conversions T₂(𝔽p6) -> 𝔾ₜ, one `fromTorus2_vartime`
+  ## call per element. The torus inputs are affine (`T2Aff`) and are prepared
+  ## with `batchFromGT_vartime` outside the timed section.
+  var tt = newSeq[T2Aff[Fp6[C]]](batchSize)
+  var aa = newSeq[Quad[Fp6[C]]](batchSize)
+  for a in aa.mitems():
+    a = rng.random_gt(Quad[Fp6[C]])
+  tt.batchFromGT_vartime(aa)
+  bench("𝔾ₜ <- T₂(𝔽p6) multi-conversion naive - " & $batchSize, Quad[Fp6[C]], iters):
+    for i in 0 ..< batchSize:
+      aa[i].fromTorus2_vartime(tt[i])
+
+proc gtFromTorus2MultiBatch(C: static Algebra, batchSize, iters: int) =
+  ## Times `batchSize` conversions T₂(𝔽p6) -> 𝔾ₜ done by a single
+  ## `batchFromTorus2_vartime` call. The torus inputs are projective (`T2Prj`)
+  ## and are prepared with `fromGT_vartime` outside the timed section.
+  var tt = newSeq[T2Prj[Fp6[C]]](batchSize)
+  var aa = newSeq[Quad[Fp6[C]]](batchSize)
+  for a in aa.mitems():
+    a = rng.random_gt(Quad[Fp6[C]])
+  for i in 0 ..< batchSize:
+    tt[i].fromGT_vartime(aa[i])
+  bench("𝔾ₜ <- T₂(𝔽p6) multi-conversion batched - " & $batchSize, Quad[Fp6[C]], iters):
+    aa.batchFromTorus2_vartime(tt)
+
 proc mulT2_aff(C: static Algebra, iters: int) =
   let a = rng.random_gt(Quad[Fp6[C]])
   let b = rng.random_gt(Quad[Fp6[C]])
@@ -204,6 +252,12 @@ proc main() =
     separator()
     torusFromGt(curve, Iters)
     gtFromTorus(curve, Iters)
+    separator()
+    torusFromGtMultiNaive(curve, batchSize = 256, BatchIters)
+    torusFromGtMultiBatch(curve, batchSize = 256, BatchIters)
+    gtFromTorus2MultiNaive(curve, batchSize = 256, BatchIters)
+    gtFromTorus2MultiBatch(curve, batchSize = 256, BatchIters)
+    separator()
     mulT2_aff(curve, Iters)
     mulT2_mix(curve, Iters)
     mulT2_prj(curve, Iters)
diff --git a/constantine/math/pairings/gt_prj.nim b/constantine/math/pairings/gt_prj.nim
index 66bbaf38..83d59055 100644
--- a/constantine/math/pairings/gt_prj.nim
+++ b/constantine/math/pairings/gt_prj.nim
@@ -361,3 +361,122 @@ proc inv*[F](r: var T2Prj[F], a: T2Prj[F]) {.inline.} =
   # Cyclotomic inversion on a Torus
   r.x.neg(a.x)
   r.z = a.z
+
+# Batched conversions
+# -------------------
+
+proc batchFromGT_vartime*[F](dst: var openArray[T2Aff[F]],
+                             src: openArray[QuadraticExt[F]]) =
+  ## Batch conversion to Torus
+  ##
+  ## Sets dst[i] to (-1 - src[i].c0) / src[i].c1 for every index,
+  ## sharing a single inversion between all elements
+  ## (reading `prod`, `diff` and `inv_vartime` as the field
+  ## multiplication, subtraction and inversion).
+  ##
+  ## `dst` and `src` are expected to have the same length,
+  ## which is asserted inside the `debug:` block.
+  ## An empty batch is a no-op.
+  ##
+  ## This requires all `src[i].c1` to be different from 0:
+  ## the `c1` coordinates are multiplied together before the single
+  ## inversion, so one zero `c1` zeroes the product for the whole batch.
+  ## NOTE(review): presumably the neutral element of 𝔾ₜ has c1 == 0,
+  ## confirm that callers never pass it.
+  ##
+  ## This replaces all inversions but one (on 𝔽p6 for 𝔾ₜ in 𝔽p12)
+  ## by 3 multiplications.
+  ##
+  ## Note: on 𝔽p6, the ratio of inversion I/M is about 3.8
+  ## so this is about a ~25% speedup
+
+  debug: doAssert dst.len == src.len
+
+  if dst.len == 0:
+    # Nothing to convert; the code below indexes dst[0] and src[0].
+    return
+
+  # Forward pass: dst[i] <- c1[0] * ... * c1[i]
+  F(dst[0]) = src[0].c1
+  for i in 1 ..< dst.len:
+    F(dst[i]).prod(F dst[i-1], src[i].c1)
+
+  # The only inversion of the batch
+  var accInv {.noInit.}: F
+  accInv.inv_vartime(F dst[dst.len-1])
+
+  # Backward pass: peel off one factor per step
+  for i in countdown(dst.len-1, 1):
+    # Compute inverse
+    F(dst[i]).prod(accInv, F dst[i-1])
+    # Next iteration
+    accInv *= src[i].c1
+
+  F(dst[0]) = accInv
+
+  var minusOne {.noInit.}: F
+  minusOne.setMinusOne()
+
+  # Scale each inverse by (-1 - c0)
+  for i in 0 ..< dst.len:
+    var t {.noInit.}: F
+    t.diff(minusOne, src[i].c0)
+    F(dst[i]) *= t
+
+proc batchFromTorus2_vartime*[F](dst: var openArray[QuadraticExt[F]],
+                                 src: openArray[T2Prj[F]]) =
+  ## Batch conversion to 𝔾ₜ
+  ##
+  ## Sets dst[i] to conj(s) * conj(1 / conj(s)) where s is src[i] viewed as a
+  ## `QuadraticExt[F]`, sharing a single inversion between all elements.
+  ##
+  ## `dst` and `src` are expected to have the same length,
+  ## which is asserted inside the `debug:` block.
+  ## An empty batch is a no-op.
+  ##
+  ## This requires all `src` to be different from 0.
+  ## This is always true for elements in 𝔾ₜ.
+  ##
+  ## This replaces all inversions but one (on 𝔽p12 for 𝔾ₜ in 𝔽p12)
+  ## by 3 multiplications.
+  ##
+  ## Note: on 𝔽p12, the ratio of inversion I/M is about 3
+  ## so this has likely no speedup, and is not trivial to parallelize
+  debug: doAssert dst.len == src.len
+
+  if dst.len == 0:
+    # Nothing to convert; the code below indexes dst[0] and src[0].
+    return
+
+  # We consciously choose to recompute conj(src[i]) to avoid an allocation
+  # On BLS12-381, src[i] elements are 12*48 bytes = 576 bytes
+  type QF = QuadraticExt[F]
+
+  # Forward pass: dst[i] <- conj(src[0]) * ... * conj(src[i])
+  dst[0].conj(QF src[0])
+  for i in 1 ..< dst.len:
+    var ti {.noInit.}: QF
+    ti.conj(QF src[i])
+    dst[i].prod(dst[i-1], ti)
+
+  # The only inversion of the batch
+  var accInv{.noInit.}: QF
+  accInv.inv(dst[dst.len-1])
+
+  for i in countdown(dst.len-1, 1):
+    # Compute inverse
+    dst[i].prod(accInv, dst[i-1])
+    # Conjugate it
+    dst[i].conj()
+    # Next iteration
+    var ti {.noInit.}: QF
+    ti.conj(QF src[i])
+    accInv *= ti
+    # Finalize conversion
+    dst[i] *= ti
+
+  # Element 0: accInv is now the inverse of conj(src[0])
+  dst[0].conj(accInv)
+  var t {.noInit.}: QF
+  t.conj(QF src[0])
+  dst[0] *= t
diff --git a/tests/math_pairings/t_gt_prj.nim b/tests/math_pairings/t_gt_prj.nim
index 453de974..46ae93cf 100644
--- a/tests/math_pairings/t_gt_prj.nim
+++ b/tests/math_pairings/t_gt_prj.nim
@@ -32,6 +32,8 @@
echo "𝔾ₜ projective", " xoshiro512** seed: ", seed
 
 const Fp6iters = 10
+const BatchIters = 1
+const BatchSize = 256
 
 suite "𝔽p6 projective over 𝔽p2":
   test "Select check from Magma":
@@ -292,3 +294,71 @@ suite "Torus-based Cryptography for 𝔾ₜ, T₂(𝔽p6) compression":
     test(BN254_Nogami)
     # test(BN254_Snarks)
     test(BLS12_381)
+
+  # ====================================================================================
+
+  test "Batch conversion: T₂(𝔽p6) <- 𝔾ₜ":
+    proc test(Name: static Algebra) =
+      # 𝔽p12 is spelled out as a quadratic extension of 𝔽p6 so that this test
+      # keeps passing even if we choose the Fp2 -> Fp4 -> Fp12 tower.
+      type
+        F6 = Fp6[Name]
+        MyFp12 = QuadraticExt[F6]
+
+      for _ in 0 ..< BatchIters:
+        # Inputs drawn with `rng.random_gt`
+        var gts = newSeq[MyFp12](BatchSize)
+        for g in gts.mitems():
+          g = rng.random_gt(MyFp12)
+
+        # Reference: one conversion per element
+        var viaSingle = newSeq[T2Aff[F6]](BatchSize)
+        for idx in 0 ..< BatchSize:
+          viaSingle[idx].fromGT_vartime(gts[idx])
+
+        # Under test: the whole batch in one call
+        var viaBatch = newSeq[T2Aff[F6]](BatchSize)
+        viaBatch.batchFromGT_vartime(gts)
+
+        for idx in 0 ..< BatchSize:
+          doAssert bool(F6(viaBatch[idx]) == F6(viaSingle[idx])),
+            "\niteration " & $idx & ":\n" &
+            " found: " & F6(viaBatch[idx]).toHex(indent = 12) & "\n" &
+            " expected: " & F6(viaSingle[idx]).toHex(indent = 12) & "\n"
+
+    test(BN254_Nogami)
+    # test(BN254_Snarks)
+    test(BLS12_381)
+
+  test "Batch conversion: 𝔾ₜ <- T₂(𝔽p6)":
+    proc test(Name: static Algebra) =
+      # 𝔽p12 is spelled out as a quadratic extension of 𝔽p6 so that this test
+      # keeps passing even if we choose the Fp2 -> Fp4 -> Fp12 tower.
+      type
+        F6 = Fp6[Name]
+        MyFp12 = QuadraticExt[F6]
+
+      for _ in 0 ..< BatchIters:
+        # Inputs drawn with `rng.random_gt`; they are also the expected outputs
+        var gts = newSeq[MyFp12](BatchSize)
+        for g in gts.mitems():
+          g = rng.random_gt(MyFp12)
+
+        # Convert each element to a projective torus element
+        var compressed = newSeq[T2Prj[F6]](BatchSize)
+        for idx in 0 ..< BatchSize:
+          compressed[idx].fromGT_vartime(gts[idx])
+
+        # Under test: convert the whole batch back in one call
+        var roundTrip = newSeq[MyFp12](BatchSize)
+        roundTrip.batchFromTorus2_vartime(compressed)
+
+        for idx in 0 ..< BatchSize:
+          doAssert bool(gts[idx] == roundTrip[idx]),
+            "\niteration " & $idx & ":\n" &
+            " found: " & roundTrip[idx].toHex(indent = 12) & "\n" &
+            " expected: " & gts[idx].toHex(indent = 12) & "\n"
+
+    test(BN254_Nogami)
+    # test(BN254_Snarks)
+    test(BLS12_381)