From 13b819b49be155117fd4430478e48adb4d5f1db1 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Thu, 15 Aug 2024 23:05:20 +0200 Subject: [PATCH 01/15] Resolved issue for high precision MLE estimation --- src/core/src/sketch/hyperloglog/estimators.rs | 152 +++++++++++++----- src/core/src/sketch/hyperloglog/mod.rs | 77 ++++++++- 2 files changed, 189 insertions(+), 40 deletions(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index 9a2d7994ef..7d36461ae0 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -1,38 +1,91 @@ -use std::cmp; +use core::{ + cmp, + ops::{Add, AddAssign, Shl, Sub, SubAssign}, +}; pub type CounterType = u8; -pub fn counts(registers: &[CounterType], q: usize) -> Vec { - let mut counts = vec![0; q + 2]; +/// Trait for types that can be used as multiplicity integers. +pub trait MulteplicityInteger: + Shl + + Copy + + AddAssign + + SubAssign + + Eq + + Sub + + Add + + TryFrom + + Ord +{ + /// The zero value. + const ZERO: Self; + /// The one value. + const ONE: Self; + + /// Convert the value to a `f64`. + fn to_f64(self) -> f64; +} + +macro_rules! impl_multeplicity_integer { + ($($t:ty),*) => { + $( + impl MulteplicityInteger for $t { + const ONE: Self = 1; + const ZERO: Self = 0; + + fn to_f64(self) -> f64 { + self as f64 + } + } + )* + }; +} + +impl_multeplicity_integer!(u8, u16, u32); + +pub fn counts(registers: &[CounterType], q: usize) -> Vec { + let mut counts = vec![M::ZERO; q + 2]; for k in registers { - counts[*k as usize] += 1; + counts[*k as usize] += M::ONE; } counts } #[allow(clippy::many_single_char_names)] -pub fn mle(counts: &[u16], p: usize, q: usize, relerr: f64) -> f64 { - let m = 1 << p; +pub fn mle(counts: &[M], p: usize, q: usize, relerr: f64) -> f64 { + let m: M = M::ONE << p; + + // If all of the registers are equal to zero, then we return zero. + if counts[0] == m { + return 0.0; + } + + // If all of the registers are equal to the maximal possible value + // that a register may have, then we return infinity. if counts[q + 1] == m { return f64::INFINITY; } - let (k_min, _) = counts.iter().enumerate().find(|(_, v)| **v != 0).unwrap(); + let (k_min, _) = counts + .iter() + .enumerate() + .find(|(_, v)| **v != M::ZERO) + .unwrap(); let k_min_prime = cmp::max(1, k_min); let (k_max, _) = counts .iter() .enumerate() .rev() - .find(|(_, v)| **v != 0) + .find(|(_, v)| **v != M::ZERO) .unwrap(); let k_max_prime = cmp::min(q, k_max); let mut z = 0.; for i in num_iter::range_step_inclusive(k_max_prime as i32, k_min_prime as i32, -1) { - z = 0.5 * z + counts[i as usize] as f64; + z = 0.5 * z + counts[i as usize].to_f64(); } // ldexp(x, i) = x * (2 ** i) @@ -44,9 +97,9 @@ pub fn mle(counts: &[u16], p: usize, q: usize, relerr: f64) -> f64 { } let mut g_prev = 0.; - let a = z + (counts[0] as f64); - let b = z + (counts[q + 1] as f64) * 2f64.powi(-(q as i32)); - let m_prime = (m - counts[0]) as f64; + let a = z + (counts[0].to_f64()); + let b = z + (counts[q + 1].to_f64()) * 2f64.powi(-(q as i32)); + let m_prime = (m - counts[0]).to_f64(); let mut x = if b <= 1.5 * a { // weak lower bound (47) @@ -57,7 +110,7 @@ pub fn mle(counts: &[u16], p: usize, q: usize, relerr: f64) -> f64 { }; let mut delta_x = x; - let del = relerr / (m as f64).sqrt(); + let del = relerr / m.to_f64().sqrt(); while delta_x > x * del { // secant method iteration @@ -78,13 +131,13 @@ pub fn mle(counts: &[u16], p: usize, q: usize, relerr: f64) -> f64 { } // compare (53) - let mut g = c_prime as f64 * h; + let mut g = c_prime.to_f64() * h; for k in num_iter::range_step_inclusive(k_max_prime as i32 - 1, k_min_prime as i32, -1) { let h_prime = 1. - h; // Calculate h(x/2^k), see (56), at this point x_prime = x / (2^(k+2)) h = (x_prime + h * h_prime) / (x_prime + h_prime); - g += counts[k as usize] as f64 * h; + g += counts[k as usize].to_f64() * h; x_prime += x_prime; } @@ -100,7 +153,7 @@ pub fn mle(counts: &[u16], p: usize, q: usize, relerr: f64) -> f64 { g_prev = g } - m as f64 * x + m.to_f64() * x } /// Calculate the joint maximum likelihood of A and B. @@ -111,36 +164,59 @@ pub fn joint_mle( k2: &[CounterType], p: usize, q: usize, -) -> (usize, usize, usize) { - let mut c1 = vec![0; q + 2]; - let mut c2 = vec![0; q + 2]; - let mut cu = vec![0; q + 2]; - let mut cg1 = vec![0; q + 2]; - let mut cg2 = vec![0; q + 2]; - let mut ceq = vec![0; q + 2]; +) -> (usize, usize, usize) +{ + if p < 8 { + joint_mle_dispatch::(k1, k2, p, q) + } else if p < 16 { + joint_mle_dispatch::(k1, k2, p, q) + } else { + assert!(p == 16 || p == 17 || p == 18); + joint_mle_dispatch::(k1, k2, p, q) + } +} + +/// Calculate the joint maximum likelihood of A and B. +/// +/// Returns a tuple (only in A, only in B, intersection) +fn joint_mle_dispatch( + k1: &[CounterType], + k2: &[CounterType], + p: usize, + q: usize, +) -> (usize, usize, usize) +where + >::Error: std::fmt::Debug, +{ + let mut c1 = vec![M::ZERO; q + 2]; + let mut c2 = vec![M::ZERO; q + 2]; + let mut cu = vec![M::ZERO; q + 2]; + let mut cg1 = vec![M::ZERO; q + 2]; + let mut cg2 = vec![M::ZERO; q + 2]; + let mut ceq = vec![M::ZERO; q + 2]; for (k1_, k2_) in k1.iter().zip(k2.iter()) { match k1_.cmp(k2_) { cmp::Ordering::Less => { - c1[*k1_ as usize] += 1; - cg2[*k2_ as usize] += 1; + c1[*k1_ as usize] += M::ONE; + cg2[*k2_ as usize] += M::ONE; } cmp::Ordering::Greater => { - cg1[*k1_ as usize] += 1; - c2[*k2_ as usize] += 1; + cg1[*k1_ as usize] += M::ONE; + c2[*k2_ as usize] += M::ONE; } cmp::Ordering::Equal => { - ceq[*k1_ as usize] += 1; + ceq[*k1_ as usize] += M::ONE; } } - cu[*cmp::max(k1_, k2_) as usize] += 1; + cu[*cmp::max(k1_, k2_) as usize] += M::ONE; } - for (i, (v, u)) in cg1.iter().zip(ceq.iter()).enumerate() { + for (i, (&v, &u)) in cg1.iter().zip(ceq.iter()).enumerate() { c1[i] += v + u; } - for (i, (v, u)) in cg2.iter().zip(ceq.iter()).enumerate() { + for (i, (&v, &u)) in cg2.iter().zip(ceq.iter()).enumerate() { c2[i] += v + u; } @@ -148,20 +224,22 @@ pub fn joint_mle( let c_bx = mle(&c2, p, q, 0.01); let c_abx = mle(&cu, p, q, 0.01); - let mut counts_axb_half = vec![0u16; q + 2]; - let mut counts_bxa_half = vec![0u16; q + 2]; + let mut counts_axb_half = vec![M::ZERO; q + 2]; + let mut counts_bxa_half = vec![M::ZERO; q + 2]; - counts_axb_half[q] = k1.len() as u16; - counts_bxa_half[q] = k2.len() as u16; + counts_axb_half[q] = M::try_from(k1.len()).unwrap(); + counts_bxa_half[q] = M::try_from(k2.len()).unwrap(); for _q in 0..q { counts_axb_half[_q] = cg1[_q] + ceq[_q] + cg2[_q + 1]; debug_assert!(counts_axb_half[q] >= counts_axb_half[_q]); - counts_axb_half[q] -= counts_axb_half[_q]; + let multeplicity_q = counts_axb_half[_q]; + counts_axb_half[q] -= multeplicity_q; counts_bxa_half[_q] = cg2[_q] + ceq[_q] + cg1[_q + 1]; debug_assert!(counts_bxa_half[q] >= counts_bxa_half[_q]); - counts_bxa_half[q] -= counts_bxa_half[_q]; + let multeplicity_q = counts_bxa_half[_q]; + counts_bxa_half[q] -= multeplicity_q; } let c_axb_half = mle(&counts_axb_half, p, q - 1, 0.01); diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index ee09caa6e5..2746df6f97 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -81,9 +81,29 @@ impl HyperLogLog { } pub fn cardinality(&self) -> usize { - let counts = estimators::counts(&self.registers, self.q); - - estimators::mle(&counts, self.p, self.q, 0.01) as usize + if self.p < 8 { + estimators::mle( + &estimators::counts::(&self.registers, self.q), + self.p, + self.q, + 0.01, + ) as usize + } else if self.p < 16 { + estimators::mle( + &estimators::counts::(&self.registers, self.q), + self.p, + self.q, + 0.05, + ) as usize + } else { + assert!(self.p == 16 || self.p == 17 || self.p == 18); + estimators::mle( + &estimators::counts::(&self.registers, self.q), + self.p, + self.q, + 0.1, + ) as usize + } } pub fn similarity(&self, other: &HyperLogLog) -> f64 { @@ -224,8 +244,10 @@ impl Update for KmerMinHash { #[cfg(test)] mod test { use std::collections::HashSet; + use std::hash::{DefaultHasher, Hash}; use std::io::{BufReader, BufWriter, Read}; use std::path::PathBuf; + use std::hash::Hasher; use crate::signature::SigsTrait; use needletail::{parse_fastx_file, parse_fastx_reader, Sequence}; @@ -374,4 +396,53 @@ mod test { assert_eq!(hll_new.registers, hll.registers); assert_eq!(hll_new.ksize, hll.ksize); } + + #[test] + /// Test to cover corner cases in the MLE calculation + /// that may happen at resolutions 16, 17 or 18, i.e. + /// cases with 2^16 == 65536, 2^17 == 131072, 2^18 == 262144. + /// + /// In such cases, the MLE multeplicities which were earlier + /// implemented always using a u16 type, may overflow. + fn test_mle_corner_cases() { + for precision in [16, 17, 18] { + let mut hll = HyperLogLog::new(precision, 21).unwrap(); + for i in 1..5000 { + let mut hasher = DefaultHasher::new(); + i.hash(&mut hasher); + let hash = hasher.finish(); + hll.add_hash(hash) + } + + let cardinality = hll.cardinality(); + + assert!(cardinality > 4500 && cardinality < 5500); + + // We build a second hll to check whether the union of the two + // hlls is consistent with the cardinality of the union. + let mut hll2 = HyperLogLog::new(precision, 21).unwrap(); + + for i in 5000..10000 { + let mut hasher = DefaultHasher::new(); + i.hash(&mut hasher); + let hash = hasher.finish(); + hll2.add_hash(hash) + } + + let mut hll_union = hll.clone(); + hll_union.merge(&hll2).unwrap(); + let cardinality_union = hll_union.cardinality(); + + assert!( + cardinality_union > 9500 && cardinality_union < 10500, + "precision: {}, cardinality_union: {}", + precision, + cardinality_union + ); + + let intersection = hll.intersection(&hll2); + + assert!(intersection < 500); + } + } } From f9bdf12750bd3b1909390b0fb2cd2f220e460bd5 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:46:51 +0200 Subject: [PATCH 02/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index 7d36461ae0..b456ce165a 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -6,7 +6,7 @@ use core::{ pub type CounterType = u8; /// Trait for types that can be used as multiplicity integers. -pub trait MulteplicityInteger: +pub trait MultiplicityInteger: Shl + Copy + AddAssign From 3e95e6e37888cb439046e59f15f92b2f0626088c Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:46:56 +0200 Subject: [PATCH 03/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index b456ce165a..b35a8284be 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -26,7 +26,7 @@ pub trait MultiplicityInteger: fn to_f64(self) -> f64; } -macro_rules! impl_multeplicity_integer { +macro_rules! impl_multiplicity_integer { ($($t:ty),*) => { $( impl MulteplicityInteger for $t { From 986c7e206715c40f875b09df1e15104e18ae4a98 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:01 +0200 Subject: [PATCH 04/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index b35a8284be..11e418ba80 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -29,7 +29,7 @@ pub trait MultiplicityInteger: macro_rules! impl_multiplicity_integer { ($($t:ty),*) => { $( - impl MulteplicityInteger for $t { + impl MultiplicityInteger for $t { const ONE: Self = 1; const ZERO: Self = 0; From 519ee168508bea6b90d3df1b910972535a1be43d Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:05 +0200 Subject: [PATCH 05/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index 11e418ba80..0db2ee1b8b 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -41,7 +41,7 @@ macro_rules! impl_multiplicity_integer { }; } -impl_multeplicity_integer!(u8, u16, u32); +impl_multiplicity_integer!(u8, u16, u32); pub fn counts(registers: &[CounterType], q: usize) -> Vec { let mut counts = vec![M::ZERO; q + 2]; From 995f9b5c522360f405d9506a7ebaba24f7249a41 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:11 +0200 Subject: [PATCH 06/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index 0db2ee1b8b..e0d526eab7 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -43,7 +43,7 @@ macro_rules! impl_multiplicity_integer { impl_multiplicity_integer!(u8, u16, u32); -pub fn counts(registers: &[CounterType], q: usize) -> Vec { +pub fn counts(registers: &[CounterType], q: usize) -> Vec { let mut counts = vec![M::ZERO; q + 2]; for k in registers { From 111d38b3c313170fffb5ad55b05f22b805cc1972 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:16 +0200 Subject: [PATCH 07/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index e0d526eab7..7e7c315f2b 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -233,7 +233,7 @@ where for _q in 0..q { counts_axb_half[_q] = cg1[_q] + ceq[_q] + cg2[_q + 1]; debug_assert!(counts_axb_half[q] >= counts_axb_half[_q]); - let multeplicity_q = counts_axb_half[_q]; + let multiplicity_q = counts_axb_half[_q]; counts_axb_half[q] -= multeplicity_q; counts_bxa_half[_q] = cg2[_q] + ceq[_q] + cg1[_q + 1]; From bfda7fe32e5a6e786410c611c392a629fe43fce1 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:22 +0200 Subject: [PATCH 08/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index 7e7c315f2b..b2dd271257 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -234,7 +234,7 @@ where counts_axb_half[_q] = cg1[_q] + ceq[_q] + cg2[_q + 1]; debug_assert!(counts_axb_half[q] >= counts_axb_half[_q]); let multiplicity_q = counts_axb_half[_q]; - counts_axb_half[q] -= multeplicity_q; + counts_axb_half[q] -= multiplicity_q; counts_bxa_half[_q] = cg2[_q] + ceq[_q] + cg1[_q + 1]; debug_assert!(counts_bxa_half[q] >= counts_bxa_half[_q]); From 4b8a4ed4e54c921b303d9022c96b048700327081 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:27 +0200 Subject: [PATCH 09/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index b2dd271257..e49b31996c 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -239,7 +239,7 @@ where counts_bxa_half[_q] = cg2[_q] + ceq[_q] + cg1[_q + 1]; debug_assert!(counts_bxa_half[q] >= counts_bxa_half[_q]); let multeplicity_q = counts_bxa_half[_q]; - counts_bxa_half[q] -= multeplicity_q; + counts_bxa_half[q] -= multiplicity_q; } let c_axb_half = mle(&counts_axb_half, p, q - 1, 0.01); From f80c9596e0f04c1865a1eb99ad86c94e3bacbf8a Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:33 +0200 Subject: [PATCH 10/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index e49b31996c..2b73feb8f1 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -238,7 +238,7 @@ where counts_bxa_half[_q] = cg2[_q] + ceq[_q] + cg1[_q + 1]; debug_assert!(counts_bxa_half[q] >= counts_bxa_half[_q]); - let multeplicity_q = counts_bxa_half[_q]; + let multiplicity_q = counts_bxa_half[_q]; counts_bxa_half[q] -= multiplicity_q; } From 1545606e61bb9b45e76d2fdde64b0b3bcb1312e0 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:47:40 +0200 Subject: [PATCH 11/15] Update src/core/src/sketch/hyperloglog/mod.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index 2746df6f97..cfecfe5f1d 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -402,7 +402,7 @@ mod test { /// that may happen at resolutions 16, 17 or 18, i.e. /// cases with 2^16 == 65536, 2^17 == 131072, 2^18 == 262144. /// - /// In such cases, the MLE multeplicities which were earlier + /// In such cases, the MLE multiplicities which were earlier /// implemented always using a u16 type, may overflow. fn test_mle_corner_cases() { for precision in [16, 17, 18] { From cb7611d0abe01266390cbce6f7bb06176c6ae209 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:53:41 +0200 Subject: [PATCH 12/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index 2b73feb8f1..c2a60f6ce3 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -54,7 +54,7 @@ pub fn counts(registers: &[CounterType], q: usize) -> Ve } #[allow(clippy::many_single_char_names)] -pub fn mle(counts: &[M], p: usize, q: usize, relerr: f64) -> f64 { +pub fn mle(counts: &[M], p: usize, q: usize, relerr: f64) -> f64 { let m: M = M::ONE << p; // If all of the registers are equal to zero, then we return zero. From 7750f14102cbd4c2e0024615f4672472f652c2e6 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Sun, 18 Aug 2024 18:53:48 +0200 Subject: [PATCH 13/15] Update src/core/src/sketch/hyperloglog/estimators.rs Co-authored-by: Luiz Irber --- src/core/src/sketch/hyperloglog/estimators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index c2a60f6ce3..3711193d3a 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -179,7 +179,7 @@ pub fn joint_mle( /// Calculate the joint maximum likelihood of A and B. /// /// Returns a tuple (only in A, only in B, intersection) -fn joint_mle_dispatch( +fn joint_mle_dispatch( k1: &[CounterType], k2: &[CounterType], p: usize, From 6b168dad96bc72d9c3088d566357a4b97a84d073 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 18 Aug 2024 12:48:36 -0700 Subject: [PATCH 14/15] fmt --- src/core/src/sketch/hyperloglog/estimators.rs | 3 +-- src/core/src/sketch/hyperloglog/mod.rs | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs index 3711193d3a..4f2a01449c 100644 --- a/src/core/src/sketch/hyperloglog/estimators.rs +++ b/src/core/src/sketch/hyperloglog/estimators.rs @@ -164,8 +164,7 @@ pub fn joint_mle( k2: &[CounterType], p: usize, q: usize, -) -> (usize, usize, usize) -{ +) -> (usize, usize, usize) { if p < 8 { joint_mle_dispatch::(k1, k2, p, q) } else if p < 16 { diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index f1172bdfc2..ab26a78adf 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -251,10 +251,10 @@ impl Update for KmerMinHash { #[cfg(test)] mod test { use std::collections::HashSet; + use std::hash::Hasher; use std::hash::{DefaultHasher, Hash}; use std::io::{BufReader, BufWriter, Read}; use std::path::PathBuf; - use std::hash::Hasher; use crate::signature::SigsTrait; use needletail::{parse_fastx_file, parse_fastx_reader, Sequence}; From 6c5cd9b8aea384da925048d2ee54196bbf7e0638 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 18 Aug 2024 12:49:39 -0700 Subject: [PATCH 15/15] Add Luca ORCID --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c523e919c9..2583debf52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ authors = [ { name="Colton Baumler", orcid="0000-0002-5926-7792" }, { name="Olga Botvinnik", orcid="0000-0003-4412-7970" }, { name="Phillip Brooks", orcid="0000-0003-3987-244X" }, + { name="Luca Cappelletti", orcid="0000-0002-1269-2038" }, { name="Peter Cock", orcid="0000-0001-9513-9993" }, { name="Daniel Dsouza", orcid="0000-0001-7843-8596" }, { name="Jade Gardner", orcid="0009-0005-0787-5752" },