diff --git a/README.md b/README.md index f0b6e85f..0ab891cd 100644 --- a/README.md +++ b/README.md @@ -100,9 +100,9 @@ Note: some goals might be mutually exclusive, for example "plausible deniability - The bindings are put in `constantine/lib` - The headers are in [constantine/include](./include) for example [Ethereum BLS signatures](./include/constantine_ethereum_bls_signatures.h) -6. Read the examples in [examples_c](./examples_c): - - Using the [Ethereum BLS signatures bindings from C](./examples_c/ethereum_bls_signatures.c) - - Testing Constantine BLS12-381 vs GMP [./examples_c/t_libctt_bls12_381.c](./examples_c/t_libctt_bls12_381.c) +6. Read the examples in [examples-c](./examples-c): + - Using the [Ethereum BLS signatures bindings from C](./examples-c/ethereum_bls_signatures.c) + - Testing Constantine BLS12-381 vs GMP [./examples-c/t_libctt_bls12_381.c](./examples-c/t_libctt_bls12_381.c) The bindings currently provided are: diff --git a/constantine.nimble b/constantine.nimble index 9c933af8..6621d8f0 100644 --- a/constantine.nimble +++ b/constantine.nimble @@ -190,7 +190,7 @@ proc releaseBuildOptions(buildMode = bmBinary): string = # However functions that uses a large stack like `sum_reduce_vartime` become incorrect. # Hence deactivated by default. 
else: "" - + let threadLocalStorage = " --tlsEmulation=off " compiler & @@ -294,15 +294,15 @@ proc testLib(path, testName: string, useGMP: bool) = echo &"\n[Test: {path}/{testName}.c] Testing static library: {staticlibName}" # Beware MacOS annoying linker with regards to static libraries # The following standard way cannot be used on MacOS - # exec "gcc -Iinclude -Llib -o build/t_libctt_bls12_381_sl.exe examples_c/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine -Wl,-Bdynamic" + # exec "gcc -Iinclude -Llib -o build/t_libctt_bls12_381_sl.exe examples-c/t_libctt_bls12_381.c -lgmp -Wl,-Bstatic -lconstantine -Wl,-Bdynamic" exec &"{cc} -Iinclude -o build/test_lib/{testName}_staticlink.exe {path}/{testName}.c lib/{staticlibName} " & (if useGMP: "-lgmp" else: "") exec &"./build/test_lib/{testName}_staticlink.exe" echo "" task test_lib, "Test C library": exec "mkdir -p build/test_lib" - testLib("examples_c", "t_libctt_bls12_381", useGMP = true) - testLib("examples_c", "ethereum_bls_signatures", useGMP = false) + testLib("examples-c", "t_libctt_bls12_381", useGMP = true) + testLib("examples-c", "ethereum_bls_signatures", useGMP = false) testLib("tests"/"c_api", "t_threadpool", useGMP = false) # Test config @@ -551,10 +551,10 @@ const testDescNvidia: seq[string] = @[ ] const testDescThreadpool: seq[string] = @[ - "examples_threadpool/e01_simple_tasks.nim", - "examples_threadpool/e02_parallel_pi.nim", - "examples_threadpool/e03_parallel_for.nim", - "examples_threadpool/e04_parallel_reduce.nim", + "examples-threadpool/e01_simple_tasks.nim", + "examples-threadpool/e02_parallel_pi.nim", + "examples-threadpool/e03_parallel_for.nim", + "examples-threadpool/e04_parallel_reduce.nim", # "benchmarks-threadpool/bouncing_producer_consumer/threadpool_bpc.nim", # Need timing not implemented on Windows "benchmarks-threadpool/dfs/threadpool_dfs.nim", "benchmarks-threadpool/fibonacci/threadpool_fib.nim", diff --git a/constantine/threadpool/README.md b/constantine/threadpool/README.md 
index ecedf884..e749562f 100644 --- a/constantine/threadpool/README.md +++ b/constantine/threadpool/README.md @@ -35,4 +35,4 @@ Compared to [nim-taskpools](https://github.com/status-im), here are the tradeoff - Powersaving improvement, threads sleep when awaiting for a task and there is no work available. - Scheduling improvement, Constantine's threadpool incorporate Weave's adaptative scheduling policy with additional enhancement (leapfrogging) -See also [design.md](../../docs-threadpool/design.md) \ No newline at end of file +See also [design.md](../../docs/threadpool-design.md) \ No newline at end of file diff --git a/docs/implementation_nvidia_gpus.md b/docs/crypto-nvidia_gpus.md similarity index 100% rename from docs/implementation_nvidia_gpus.md rename to docs/crypto-nvidia_gpus.md diff --git a/docs/optimizations.md b/docs/crypto-optimizations.md similarity index 100% rename from docs/optimizations.md rename to docs/crypto-optimizations.md diff --git a/docs-threadpool/design.md b/docs/threadpool-design.md similarity index 100% rename from docs-threadpool/design.md rename to docs/threadpool-design.md diff --git a/docs-threadpool/partitioners.md b/docs/threadpool-partitioners.md similarity index 100% rename from docs-threadpool/partitioners.md rename to docs/threadpool-partitioners.md diff --git a/docs-threadpool/random_permutations.md b/docs/threadpool-random_permutations.md similarity index 100% rename from docs-threadpool/random_permutations.md rename to docs/threadpool-random_permutations.md diff --git a/docs/zk_accel_layer.md b/docs/zk_accel_layer.md new file mode 100644 index 00000000..75048101 --- /dev/null +++ b/docs/zk_accel_layer.md @@ -0,0 +1,235 @@ +# ZK Accel layer + +_Last update: 2023-12-04_ + +This document is aimed at software and hardware proof system accelerators providers. 
+ +It documents the steps taken in Constantine to support the candidate ZAL (Zk Accel API) +proposed in https://github.com/privacy-scaling-explorations/halo2/issues/216 +"[RFC] Blackboxing MSM and FFT - Hardware Accel API". + +> ``📝`` Note +> +> Constantine is written in the [Nim programming language](https://nim-lang.org/).\ +> Nim compiles to machine code through C hence code produced by Nim +> has the C ABI of the compiler/OS used. +> +> In short this guide is applicable for C or C++ libraries. + +## Testing + +Building Constantine with ZAL support requires: + +- Clang +- LLVM +- Nim +- Rust + +All of those are available through your package manager on MacOS or Linux. + +Commands for testing: + +``` +git clone https://github.com/mratsim/constantine +cd constantine +cargo test +cargo bench +``` + +## API + +As of 2023-12-04, the candidate API is proposed in https://github.com/privacy-scaling-explorations/halo2curves/pull/107/files#diff-e746a6a49bd01b5c3241e440d309f5a5e38e583aa9a2eaa2c97419fdc1a3104aR42 + +```Rust +pub trait ZalEngine {} + +pub trait MsmAccel<C: CurveAffine>: ZalEngine { + fn msm(&self, coeffs: &[C::Scalar], base: &[C]) -> C::Curve; +} +``` + +A similar trait will likely be proposed for FFTs and coset FFTs. + +Initialization of the ZalEngine is at the moment not enforced. +It is recommended that ZAL backends provide: +- an initialization function: + - either "fn new() -> ZalEngine" for simple libraries + - or a builder pattern for complex initializations +- a shutdown function wrapped in a Drop trait. + +The ZalEngine can be a stub type and the shutdown function might be unnecessary +if the ZalEngine uses a global threadpool like Rayon. + +Backends might want to add as an option: +- The number of threads (CPU) +- The device(s) to run on (multi-sockets machines, multi-GPUs machines, ...) +- The curve (JIT-compiled backend) +- ... 
+ +## C Backend + +Wrapping the C code can be autogenerated from Rust bindgen +```sh +cargo install bindgen-cli +``` + +Constantine uses the following [scripts/gen_rust_bindings.sh](../scripts/gen_rust_bindings.sh) + +```sh +bindgen \ + include/constantine.h \ + -o constantine-rust/constantine-sys/src/bindings.rs \ + --default-enum-style rust \ + --use-core \ + --no-derive-debug \ + --default-visibility private \ + --enable-function-attribute-detection \ + -- -Iinclude +``` + +The headers for MSM are at [include/curves/bn254_snarks_parallel.h](../include/curves/bn254_snarks_parallel.h) + +```C +void ctt_bn254_snarks_g1_prj_multi_scalar_mul_fr_coefs_vartime_parallel(const ctt_threadpool* tp, bn254_snarks_g1_prj* r, const bn254_snarks_fr coefs[], const bn254_snarks_g1_aff points[], size_t len); +``` + +The Rust wrapping is in [constantine-rust/constantine-zal-halo2kzg/src/lib.rs](../constantine-rust/constantine-zal-halo2kzg/src/lib.rs) + +```Rust +use ::core::mem::MaybeUninit; +use constantine_sys::*; +use halo2curves::bn256; +use halo2curves::zal::{MsmAccel, ZalEngine}; +use std::mem; + +pub struct CttEngine { + ctx: *mut ctt_threadpool, +} + +impl CttEngine { + #[inline(always)] + pub fn new(num_threads: usize) -> CttEngine { + let ctx = unsafe { ctt_threadpool_new(num_threads) }; + CttEngine { ctx } + } +} + +impl Drop for CttEngine { + fn drop(&mut self) { + unsafe { ctt_threadpool_shutdown(self.ctx) } + } +} + +impl ZalEngine for CttEngine {} + +impl MsmAccel<bn256::G1Affine> for CttEngine { + fn msm(&self, coeffs: &[bn256::Fr], bases: &[bn256::G1Affine]) -> bn256::G1 { + assert_eq!(coeffs.len(), bases.len()); + let mut result: MaybeUninit<bn254_snarks_g1_prj> = MaybeUninit::uninit(); + unsafe { + ctt_bn254_snarks_g1_prj_multi_scalar_mul_fr_coefs_vartime_parallel( + self.ctx, + result.as_mut_ptr(), + coeffs.as_ptr() as *const bn254_snarks_fr, + bases.as_ptr() as *const bn254_snarks_g1_aff, + bases.len(), + ); + mem::transmute::<MaybeUninit<bn254_snarks_g1_prj>, bn256::G1>(result) + } + } +} + +``` + +And testing + +```Rust 
+#[cfg(test)] +mod tests { + use super::*; + + use ark_std::{end_timer, start_timer}; + use rand_core::OsRng; + + use halo2curves::bn256; + use halo2curves::ff::Field; + use halo2curves::group::prime::PrimeCurveAffine; + use halo2curves::group::{Curve, Group}; + use halo2curves::msm::best_multiexp; + use halo2curves::zal::MsmAccel; + + #[test] + fn t_threadpool() { + let tp = CttEngine::new(4); + drop(tp); + } + + fn run_msm_zal(min_k: usize, max_k: usize) { + let points = (0..1 << max_k) + .map(|_| bn256::G1::random(OsRng)) + .collect::<Vec<_>>(); + let mut affine_points = vec![bn256::G1Affine::identity(); 1 << max_k]; + bn256::G1::batch_normalize(&points[..], &mut affine_points[..]); + let points = affine_points; + + let scalars = (0..1 << max_k) + .map(|_| bn256::Fr::random(OsRng)) + .collect::<Vec<_>>(); + + for k in min_k..=max_k { + let points = &points[..1 << k]; + let scalars = &scalars[..1 << k]; + + let t0 = start_timer!(|| format!("freestanding msm k={}", k)); + let e0 = best_multiexp(scalars, points); + end_timer!(t0); + + let engine = CttEngine::new(num_cpus::get()); + let t1 = start_timer!(|| format!("CttEngine msm k={}", k)); + let e1 = engine.msm(scalars, points); + end_timer!(t1); + + assert_eq!(e0, e1); + } + } + + #[test] + fn t_msm_zal() { + run_msm_zal(3, 14); + } +} +``` + +## ABI description + +- ZalEngine is an opaque pointer that you can use for anything (or nothing for the builtin H2cEngine) +- coeffs/scalars are: + - field elements in the range [0,r) in **Montgomery** domain, with `r` the prime **order** of the curve. + For A in Montgomery and a in canonical domain we have `A = aR (mod r)`, R being 2²⁵⁶ for BN254 + - halo2curves uses 64-bit words, word-endianness is machine-endian (i.e. little endian on x86, ARM, RISC-V) + - Big numbers are split into 64-bit limbs, least significant limb first, i.e. 
limb-endianness is little endian + - each takes 32 bytes +- points/bases are: + - group elements / short Weierstrass elliptic curve points, in **affine coordinates representation** (**NOT** jacobian) + Coordinates are ordered (x, y) + - each takes 64 bytes +- result is: + - a group element / short Weierstrass elliptic curve point, in **projective coordinates representation** (**NOT** jacobian) + + Converting from Jacobian to Projective is explained here [constantine/math/ec_shortweierstrass.nim](../constantine/math/ec_shortweierstrass.nim) + + For affine coordinates (x, y): + - Projective coordinates have the form (X, Y, Z) with x = X/Z and y = Y/Z + - Jacobian coordinates have the form (X, Y, Z) with x = X/Z² and y = Y/Z³ + + ```Nim + func projectiveFromJacobian*[F; G]( + prj: var ECP_ShortW_Prj[F, G], + jac: ECP_ShortW_Jac[F, G]) {.inline.} = + prj.x.prod(jac.x, jac.z) + prj.y = jac.y + prj.z.square(jac.z) + prj.z *= jac.z + ``` + Coordinates are ordered (X, Y, Z) + - Result requires 96 bytes \ No newline at end of file diff --git a/examples_c/README.md b/examples-c/README.md similarity index 100% rename from examples_c/README.md rename to examples-c/README.md diff --git a/examples_c/ethereum_bls_signatures.c b/examples-c/ethereum_bls_signatures.c similarity index 100% rename from examples_c/ethereum_bls_signatures.c rename to examples-c/ethereum_bls_signatures.c diff --git a/examples_c/t_libctt_bls12_381.c b/examples-c/t_libctt_bls12_381.c similarity index 100% rename from examples_c/t_libctt_bls12_381.c rename to examples-c/t_libctt_bls12_381.c diff --git a/examples_threadpool/e01_simple_tasks.nim b/examples-threadpool/e01_simple_tasks.nim similarity index 100% rename from examples_threadpool/e01_simple_tasks.nim rename to examples-threadpool/e01_simple_tasks.nim diff --git a/examples_threadpool/e02_parallel_pi.nim b/examples-threadpool/e02_parallel_pi.nim similarity index 100% rename from examples_threadpool/e02_parallel_pi.nim rename to 
examples-threadpool/e02_parallel_pi.nim diff --git a/examples_threadpool/e03_parallel_for.nim b/examples-threadpool/e03_parallel_for.nim similarity index 100% rename from examples_threadpool/e03_parallel_for.nim rename to examples-threadpool/e03_parallel_for.nim diff --git a/examples_threadpool/e04_parallel_reduce.nim b/examples-threadpool/e04_parallel_reduce.nim similarity index 100% rename from examples_threadpool/e04_parallel_reduce.nim rename to examples-threadpool/e04_parallel_reduce.nim diff --git a/examples_threadpool/raytracing/README.md b/examples-threadpool/raytracing/README.md similarity index 100% rename from examples_threadpool/raytracing/README.md rename to examples-threadpool/raytracing/README.md diff --git a/examples_threadpool/raytracing/ray_trace_300samples_nim_nested.png b/examples-threadpool/raytracing/ray_trace_300samples_nim_nested.png similarity index 100% rename from examples_threadpool/raytracing/ray_trace_300samples_nim_nested.png rename to examples-threadpool/raytracing/ray_trace_300samples_nim_nested.png diff --git a/examples_threadpool/raytracing/ray_trace_300samples_nim_threaded.png b/examples-threadpool/raytracing/ray_trace_300samples_nim_threaded.png similarity index 100% rename from examples_threadpool/raytracing/ray_trace_300samples_nim_threaded.png rename to examples-threadpool/raytracing/ray_trace_300samples_nim_threaded.png diff --git a/examples_threadpool/raytracing/smallpt.cpp b/examples-threadpool/raytracing/smallpt.cpp similarity index 100% rename from examples_threadpool/raytracing/smallpt.cpp rename to examples-threadpool/raytracing/smallpt.cpp diff --git a/examples_threadpool/raytracing/smallpt.nim b/examples-threadpool/raytracing/smallpt.nim similarity index 100% rename from examples_threadpool/raytracing/smallpt.nim rename to examples-threadpool/raytracing/smallpt.nim