From 018c3e9a15fcafd7c5de7fb5efade5102d3ace33 Mon Sep 17 00:00:00 2001
From: Jonas Malaco
Date: Thu, 16 Jan 2025 06:35:21 -0300
Subject: [PATCH] argon2: allocate blocks as a single chunk of bytes

While investigating the scaling performance of the parallel
implementation, I noticed a substantial chunk of time taken on block
allocation in `hash_password_into`. The issue lies in
`vec![Block::default(); ...]`, which clones the supplied block. This
happens because the standard library lacks a suitable specialization
that can be used with `Block` (or, for that matter, `[u64; 128]`).

Therefore, let's instead allocate a big bag of bytes and then transmute
it, or more precisely a mutable slice into it, to produce the slice of
blocks to pass into `hash_password_into_with_memory`.

One point to pay attention to is that `Block` currently specifies
64-byte alignment, while a byte slice has alignment of 1. Luckily,
`slice::align_to_mut` is particularly well suited for this. It is also
cleaner and less error prone than other unsafe alternatives I tried (a
couple of them using `MaybeUninit`).

This patch passes Miri on:

    reference_argon2i_v0x13_2_8_2
    reference_argon2id_v0x13_2_8_2

And the performance gains are considerable:

    argon2id V0x13 m=2048 t=8 p=4
        time:   [3.3493 ms 3.3585 ms 3.3686 ms]
        change: [-6.1577% -5.7842% -5.4067%] (p = 0.00 < 0.05)
        Performance has improved.
    argon2id V0x13 m=32768 t=4 p=4
        time:   [24.106 ms 24.253 ms 24.401 ms]
        change: [-9.8553% -8.9089% -7.9745%] (p = 0.00 < 0.05)
        Performance has improved.
    argon2id V0x13 m=1048576 t=1 p=4
        time:   [181.68 ms 182.96 ms 184.35 ms]
        change: [-28.165% -27.506% -26.896%] (p = 0.00 < 0.05)
        Performance has improved.

(For the users that don't allocate the blocks themselves.)
---
 argon2/src/lib.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/argon2/src/lib.rs b/argon2/src/lib.rs
index dfb9984e..b7239fee 100644
--- a/argon2/src/lib.rs
+++ b/argon2/src/lib.rs
@@ -239,7 +239,13 @@ impl<'key> Argon2<'key> {
     #[cfg(feature = "alloc")]
     #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
     pub fn hash_password_into(&self, pwd: &[u8], salt: &[u8], out: &mut [u8]) -> Result<()> {
-        let mut blocks = vec![Block::default(); self.params.block_count()];
+        // For moderate and large `n`, `vec![Block::new(); n]` is significantly slower than
+        // creating a `Vec` of bytes. But as byte slices have alignment of 1, use a vec one `Block`
+        // alignment too large to ensure that we can get enough aligned blocks out of it.
+        let size = self.params.block_count() * size_of::<Block>() + align_of::<Block>();
+        let mut bytes = vec![0u8; size];
+        // SAFETY: all-zeros is a valid bit pattern for `Block`.
+        let (_, mut blocks, _) = unsafe { bytes.align_to_mut::<Block>() };
         self.hash_password_into_with_memory(pwd, salt, out, &mut blocks)
     }
 