diff --git a/Cargo.lock b/Cargo.lock index cc2c0afab..a65f4bc09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,6 +277,24 @@ dependencies = [ "thiserror", ] +[[package]] +name = "bio_inline" +version = "1.3.1" +dependencies = [ + "bio-types", + "bit-set", + "bv", + "bytecount", + "fxhash", + "lazy_static", + "num", + "num-integer", + "num-traits", + "rand", + "serde", + "vec_map", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -355,6 +373,9 @@ name = "bytecount" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +dependencies = [ + "packed_simd_2", +] [[package]] name = "bytemuck" @@ -1581,6 +1602,12 @@ version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +[[package]] +name = "libm" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fc7aa29613bd6a620df431842069224d8bc9011086b1db4c0e0cd47fa03ec9a" + [[package]] name = "libm" version = "0.2.7" @@ -1747,6 +1774,7 @@ dependencies = [ "auto_ops", "bio", "bio-types", + "bio_inline", "bzip2", "chrono", "clap", @@ -1935,7 +1963,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", - "libm", + "libm 0.2.7", ] [[package]] @@ -2004,6 +2032,16 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" +[[package]] +name = "packed_simd_2" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1914cd452d8fccd6f9db48147b29fd4ae05bea9dc5d9ad578509f72415de282" +dependencies = [ + "cfg-if 1.0.0", + "libm 0.1.4", +] + [[package]] name = "parking_lot" version = "0.12.1" diff --git a/Cargo.toml 
b/Cargo.toml index 02291b49b..09014f8b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,8 +2,9 @@ members = [ "packages_rs/*", + "packages_rs/3rdparty/bio_inline", ] exclude = [ - "packages_rs/3rdparty" + "packages_rs/3rdparty", ] diff --git a/packages_rs/3rdparty/bio_inline/Cargo.toml b/packages_rs/3rdparty/bio_inline/Cargo.toml new file mode 100644 index 000000000..5104d4638 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/Cargo.toml @@ -0,0 +1,24 @@ +[package] +edition = "2021" +name = "bio_inline" +version = "1.3.1" + +[dependencies] +bio-types = "=1.0.0" +bit-set = "=0.5.3" +bv = { version = "=0.11.1", features = ["serde"] } +bytecount = "=0.6.3" +fxhash = "=0.2.1" +lazy_static = "=1.4.0" +num = "=0.4.0" +num-integer = "0.1.45" +num-traits = "=0.2.15" +serde = { version = "=1.0.164", features = ["derive"] } +vec_map = { version = "=0.8.2", features = ["serde"] } + +[features] +generic-simd = ["bytecount/generic-simd"] +runtime-dispatch-simd = ["bytecount/runtime-dispatch-simd"] + +[dev-dependencies] +rand = "=0.8.5" diff --git a/packages_rs/3rdparty/bio_inline/LICENSE.md b/packages_rs/3rdparty/bio_inline/LICENSE.md new file mode 100644 index 000000000..f295af393 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/LICENSE.md @@ -0,0 +1,9 @@ +The MIT License (MIT) + +Copyright (c) 2016 Johannes Köster, the Rust-Bio team, Google Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/packages_rs/3rdparty/bio_inline/README.md b/packages_rs/3rdparty/bio_inline/README.md new file mode 100644 index 000000000..836532d2a --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/README.md @@ -0,0 +1,16 @@ +# bio_inline + +Bits and pieces from "bio" crate (https://github.com/rust-bio/rust-bio), adapted to the needs of this project. + +#### How to add more code from bio crate: + +- add required source files from `bio` project's `src` directory to `bio_inline/src` directory here, preserving directory structure +- find and replace `bio::` with `bio_inline::` in all new files +- remove unused code and unused `use` imports recursively +- (optional) lint and format the new code +- (optional) modify the new code as needed (this is probably why you are doing this procedure) + +#### How to switch project code from using `bio` to using `bio_inline`: + +- find and replace `bio::` to `bio_inline::` +- add missing library code recursively (see previous section) until project compiles diff --git a/packages_rs/3rdparty/bio_inline/src/alphabets/dna.rs b/packages_rs/3rdparty/bio_inline/src/alphabets/dna.rs new file mode 100644 index 000000000..342a5c450 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/alphabets/dna.rs @@ -0,0 +1,114 @@ +// Copyright 2014-2015 Johannes Köster, Peer Aramillo Irizar. +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! 
Implementation of the DNA alphabet. +//! +//! # Example +//! +//! ``` +//! use bio_inline::alphabets; +//! let alphabet = alphabets::dna::alphabet(); +//! assert!(alphabet.is_word(b"GATTACA")); +//! assert!(alphabet.is_word(b"gattaca")); +//! assert!(!alphabet.is_word(b"ACGU")); +//! ``` + +use crate::alphabets::Alphabet; +use lazy_static::lazy_static; +use std::borrow::Borrow; + +/// The DNA alphabet (uppercase and lowercase). +pub fn alphabet() -> Alphabet { + Alphabet::new(b"ACGTacgt") +} + +/// The DNA alphabet including N (uppercase and lowercase). +pub fn n_alphabet() -> Alphabet { + Alphabet::new(b"ACGTNacgtn") +} + +/// The IUPAC DNA alphabet (uppercase and lowercase). +pub fn iupac_alphabet() -> Alphabet { + Alphabet::new(b"ACGTRYSWKMBDHVNZacgtryswkmbdhvnz") +} + +lazy_static! { + static ref COMPLEMENT: [u8; 256] = { + let mut comp = [0; 256]; + for (v, a) in comp.iter_mut().enumerate() { + *a = v as u8; + } + for (&a, &b) in b"AGCTYRWSKMDVHBN".iter().zip(b"TCGARYWSMKHBDVN".iter()) { + comp[a as usize] = b; + comp[a as usize + 32] = b + 32; // lowercase variants + } + comp + }; +} + +/// Return complement of given DNA alphabet character (IUPAC alphabet supported). +/// +/// Casing of input character is preserved, e.g. `t` → `a`, but `T` → `A`. +/// All `N`s remain as they are. +/// +/// ``` +/// use bio_inline::alphabets::dna; +/// +/// assert_eq!(dna::complement(65), 84); // A → T +/// assert_eq!(dna::complement(99), 103); // c → g +/// assert_eq!(dna::complement(78), 78); // N → N +/// assert_eq!(dna::complement(89), 82); // Y → R +/// assert_eq!(dna::complement(115), 115); // s → s +/// ``` +#[inline] +pub fn complement(a: u8) -> u8 { + COMPLEMENT[a as usize] +} + +/// Calculate reverse complement of given text (IUPAC alphabet supported). +/// +/// Casing of characters is preserved, e.g. `b"NaCgT"` → `b"aCgTN"`. +/// All `N`s remain as they are. 
+/// +/// ``` +/// use bio_inline::alphabets::dna; +/// +/// assert_eq!(dna::revcomp(b"ACGTN"), b"NACGT"); +/// assert_eq!(dna::revcomp(b"GaTtaCA"), b"TGtaAtC"); +/// assert_eq!(dna::revcomp(b"AGCTYRWSKMDVHBN"), b"NVDBHKMSWYRAGCT"); +/// ``` +pub fn revcomp(text: T) -> Vec +where + C: Borrow, + T: IntoIterator, + T::IntoIter: DoubleEndedIterator, +{ + text.into_iter().rev().map(|a| complement(*a.borrow())).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn is_word() { + assert!(alphabet().is_word(b"GATTACA")); + } + + #[test] + fn is_no_word() { + assert!(!alphabet().is_word(b"gaUUaca")); + } + + #[test] + fn symbol_is_no_word() { + assert!(!alphabet().is_word(b"#")); + } + + #[test] + fn number_is_no_word() { + assert!(!alphabet().is_word(b"42")); + } +} diff --git a/packages_rs/3rdparty/bio_inline/src/alphabets/mod.rs b/packages_rs/3rdparty/bio_inline/src/alphabets/mod.rs new file mode 100644 index 000000000..93050c024 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/alphabets/mod.rs @@ -0,0 +1,459 @@ +// Copyright 2014-2015 Johannes Köster, Peer Aramillo Irizar. +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! Implementation of alphabets and useful utilities. +//! +//! # Example +//! +//! ```rust +//! use bio_inline::alphabets; +//! let alphabet = alphabets::dna::alphabet(); +//! assert!(alphabet.is_word(b"AACCTgga")); +//! assert!(!alphabet.is_word(b"AXYZ")); +//! ``` + +use bit_set::BitSet; +use serde::{Deserialize, Serialize}; +use std::borrow::Borrow; +use std::mem; +use vec_map::VecMap; +pub mod dna; + +pub type SymbolRanks = VecMap; + +/// Representation of an alphabet. +#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)] +#[must_use] +pub struct Alphabet { + pub symbols: BitSet, +} + +impl Alphabet { + /// Create new alphabet from given symbols. 
+ /// + /// Complexity: O(n), where n is the number of symbols in the alphabet. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// // Create an alphabet (note that a DNA alphabet is already available in bio_inline::alphabets::dna). + /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt"); + /// // Check whether a given text is a word over the alphabet. + /// assert!(dna_alphabet.is_word(b"GAttACA")); + /// ``` + pub fn new(symbols: T) -> Self + where + C: Borrow, + T: IntoIterator, + { + let mut s = BitSet::new(); + s.extend(symbols.into_iter().map(|c| *c.borrow() as usize)); + + Alphabet { symbols: s } + } + + /// Insert symbol into alphabet. + /// + /// Complexity: O(1) + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let mut dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt"); + /// assert!(!dna_alphabet.is_word(b"N")); + /// dna_alphabet.insert(78); + /// assert!(dna_alphabet.is_word(b"N")); + /// ``` + pub fn insert(&mut self, a: u8) { + self.symbols.insert(a as usize); + } + + /// Check if given text is a word over the alphabet. + /// + /// Complexity: O(n), where n is the length of the text. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt"); + /// assert!(dna_alphabet.is_word(b"GAttACA")); + /// assert!(!dna_alphabet.is_word(b"42")); + /// ``` + pub fn is_word(&self, text: T) -> bool + where + C: Borrow, + T: IntoIterator, + { + text.into_iter().all(|c| self.symbols.contains(*c.borrow() as usize)) + } + + /// Return lexicographically maximal symbol. + /// + /// Complexity: O(n), where n is the number of symbols in the alphabet. 
+ /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT"); + /// assert_eq!(dna_alphabet.max_symbol(), Some(116)); // max symbol is "t" + /// let empty_alphabet = alphabets::Alphabet::new(b""); + /// assert_eq!(empty_alphabet.max_symbol(), None); + /// ``` + pub fn max_symbol(&self) -> Option { + self.symbols.iter().max().map(|a| a as u8) + } + + /// Return size of the alphabet. + /// + /// Upper and lower case representations of the same character + /// are counted as distinct characters. + /// + /// Complexity: O(n), where n is the number of symbols in the alphabet. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT"); + /// assert_eq!(dna_alphabet.len(), 8); + /// ``` + pub fn len(&self) -> usize { + self.symbols.len() + } + + /// Is this alphabet empty? + /// + /// Complexity: O(n), where n is the number of symbols in the alphabet. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT"); + /// assert!(!dna_alphabet.is_empty()); + /// let empty_alphabet = alphabets::Alphabet::new(b""); + /// assert!(empty_alphabet.is_empty()); + /// ``` + pub fn is_empty(&self) -> bool { + self.symbols.is_empty() + } + + /// Return a new alphabet taking the intersect between this and other. 
+ /// + /// # Example + /// ``` + /// use bio_inline::alphabets; + /// + /// let alpha_a = alphabets::Alphabet::new(b"acgtACGT"); + /// let alpha_b = alphabets::Alphabet::new(b"atcgMVP"); + /// let intersect_alpha = alpha_a.intersection(&alpha_b); + /// + /// assert_eq!(intersect_alpha, alphabets::Alphabet::new(b"atcg")); + /// ``` + pub fn intersection(&self, other: &Alphabet) -> Self { + return Alphabet { + symbols: self.symbols.intersection(&other.symbols).collect(), + }; + } + + /// Return a new alphabet taking the difference between this and other. + /// + /// # Example + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT"); + /// let dna_alphabet_upper = alphabets::Alphabet::new(b"ACGT"); + /// let dna_lower = dna_alphabet.difference(&dna_alphabet_upper); + /// + /// assert_eq!(dna_lower, alphabets::Alphabet::new(b"atcg")); + /// ``` + pub fn difference(&self, other: &Alphabet) -> Self { + return Alphabet { + symbols: self.symbols.difference(&other.symbols).collect(), + }; + } + + /// Return a new alphabet taking the union between this and other. + /// + /// # Example + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"ATCG"); + /// let tokenize_alpha = alphabets::Alphabet::new(b"?|"); + /// let alpha = dna_alphabet.union(&tokenize_alpha); + /// + /// assert_eq!(alpha, alphabets::Alphabet::new(b"ATCG?|")); + /// ``` + pub fn union(&self, other: &Alphabet) -> Self { + return Alphabet { + symbols: self.symbols.union(&other.symbols).collect(), + }; + } +} + +/// Tools based on transforming the alphabet symbols to their lexicographical ranks. +/// +/// Lexicographical rank is computed using `u8` representations, +/// i.e. ASCII codes, of the input characters. +/// For example, assuming that the alphabet consists of the symbols `A`, `C`, `G`, and `T`, this +/// will yield ranks `0`, `1`, `2`, `3` for them, respectively. 
+/// +/// `RankTransform` can be used in to perform bit encoding for texts over a +/// given alphabet via `bio_inline::data_structures::bitenc`. +#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct RankTransform { + pub ranks: SymbolRanks, +} + +impl RankTransform { + /// Construct a new `RankTransform`. + /// + /// Complexity: O(n), where n is the number of symbols in the alphabet. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT"); + /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet); + /// ``` + pub fn new(alphabet: &Alphabet) -> Self { + let mut ranks = VecMap::new(); + for (r, c) in alphabet.symbols.iter().enumerate() { + ranks.insert(c, r as u8); + } + + RankTransform { ranks } + } + + /// Get the rank of symbol `a`. + /// + /// This method panics for characters not contained in the alphabet. + /// + /// Complexity: O(1) + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT"); + /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet); + /// assert_eq!(dna_ranks.get(65), 0); // "A" + /// assert_eq!(dna_ranks.get(116), 7); // "t" + /// ``` + pub fn get(&self, a: u8) -> u8 { + *self.ranks.get(a as usize).expect("Unexpected character.") + } + + /// Transform a given `text` into a vector of rank values. + /// + /// Complexity: O(n), where n is the length of the text. 
+ /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt"); + /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet); + /// let text = b"aAcCgGtT"; + /// assert_eq!(dna_ranks.transform(text), vec![4, 0, 5, 1, 6, 2, 7, 3]); + /// ``` + pub fn transform(&self, text: T) -> Vec + where + C: Borrow, + T: IntoIterator, + { + text + .into_iter() + .map(|c| { + *self + .ranks + .get(*c.borrow() as usize) + .expect("Unexpected character in text.") + }) + .collect() + } + + /// Iterate over q-grams (substrings of length q) of given `text`. The q-grams are encoded + /// as `usize` by storing the symbol ranks in log2(|A|) bits (with |A| being the alphabet size). + /// + /// If q is larger than usize::BITS / log2(|A|), this method fails with an assertion. + /// + /// Complexity: O(n), where n is the length of the text. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt"); + /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet); + /// + /// let q_grams: Vec = dna_ranks.qgrams(2, b"ACGT").collect(); + /// assert_eq!(q_grams, vec![1, 10, 19]); + /// ``` + pub fn qgrams(&self, q: u32, text: T) -> QGrams<'_, C, T::IntoIter> + where + C: Borrow, + T: IntoIterator, + { + let bits = (self.ranks.len() as f32).log2().ceil() as u32; + assert!( + (bits * q) as usize <= mem::size_of::() * 8, + "Expecting q to be smaller than usize / log2(|A|)" + ); + + let mut qgrams = QGrams { + text: text.into_iter(), + ranks: self, + bits, + mask: 1_usize.checked_shl(q * bits).unwrap_or(0).wrapping_sub(1), + qgram: 0, + }; + + for _ in 0..q - 1 { + qgrams.next(); + } + + qgrams + } + + /// Restore alphabet from transform. + /// + /// Complexity: O(n), where n is the number of symbols in the alphabet. 
+ /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT"); + /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet); + /// assert_eq!(dna_ranks.alphabet().symbols, dna_alphabet.symbols); + /// ``` + pub fn alphabet(&self) -> Alphabet { + let mut symbols = BitSet::with_capacity(self.ranks.len()); + symbols.extend(self.ranks.keys()); + Alphabet { symbols } + } + + /// Compute the number of bits required to encode the largest rank value. + /// + /// For example, the alphabet `b"ACGT"` with 4 symbols has the maximal rank + /// 3, which can be encoded in 2 bits. + /// + /// This value can be used to create a `data_structures::bitenc::BitEnc` + /// bit encoding tailored to the given alphabet. + /// + /// Complexity: O(n), where n is the number of symbols in the alphabet. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets; + /// + /// let dna_alphabet = alphabets::Alphabet::new(b"ACGT"); + /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet); + /// assert_eq!(dna_ranks.get_width(), 2); + /// let dna_n_alphabet = alphabets::Alphabet::new(b"ACGTN"); + /// let dna_n_ranks = alphabets::RankTransform::new(&dna_n_alphabet); + /// assert_eq!(dna_n_ranks.get_width(), 3); + /// ``` + pub fn get_width(&self) -> usize { + (self.ranks.len() as f32).log2().ceil() as usize + } +} + +/// Iterator over q-grams. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize)] +pub struct QGrams<'a, C, T> +where + C: Borrow, + T: Iterator, +{ + text: T, + ranks: &'a RankTransform, + bits: u32, + mask: usize, + qgram: usize, +} + +impl<'a, C, T> QGrams<'a, C, T> +where + C: Borrow, + T: Iterator, +{ + /// Push a new character into the current qgram. 
+ fn qgram_push(&mut self, a: u8) { + self.qgram <<= self.bits; + self.qgram |= a as usize; + self.qgram &= self.mask; + } +} + +impl<'a, C, T> Iterator for QGrams<'a, C, T> +where + C: Borrow, + T: Iterator, +{ + type Item = usize; + + fn next(&mut self) -> Option { + match self.text.next() { + Some(a) => { + let b = self.ranks.get(*a.borrow()); + self.qgram_push(b); + Some(self.qgram) + } + None => None, + } + } +} + +/// Returns the english ascii lower case alphabet. +pub fn english_ascii_lower_alphabet() -> Alphabet { + Alphabet::new(&b"abcdefghijklmnopqrstuvwxyz"[..]) +} + +/// Returns the english ascii upper case alphabet. +pub fn english_ascii_upper_alphabet() -> Alphabet { + Alphabet::new(&b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"[..]) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_alphabet_eq() { + assert_eq!(Alphabet::new(b"ATCG"), Alphabet::new(b"ATCG")); + assert_eq!(Alphabet::new(b"ATCG"), Alphabet::new(b"TAGC")); + assert_ne!(Alphabet::new(b"ATCG"), Alphabet::new(b"ATC")); + } + + /// When `q * bits == usize::BITS`, make sure that `1<<(1*bits)` does not overflow. + #[test] + fn test_qgram_shiftleft_overflow() { + let alphabet = Alphabet::new(b"ACTG"); + let transform = RankTransform::new(&alphabet); + let text = b"ACTG".repeat(100); + transform.qgrams(usize::BITS / 2, text); + } +} diff --git a/packages_rs/3rdparty/bio_inline/src/data_structures/bwt.rs b/packages_rs/3rdparty/bio_inline/src/data_structures/bwt.rs new file mode 100644 index 000000000..7fe433a3f --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/data_structures/bwt.rs @@ -0,0 +1,243 @@ +// Copyright 2014-2016 Johannes Köster, Taylor Cramer. +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! 
The [Burrows-Wheeler-Transform](https://www.semanticscholar.org/paper/A-Block-sorting-Lossless-Data-Compression-Algorithm-Burrows-Wheeler/af56e6d4901dcd0f589bf969e604663d40f1be5d) and related data structures. +//! The implementation is based on the lecture notes +//! "Algorithmen auf Sequenzen", Kopczynski, Marschall, Martin and Rahmann, 2008 - 2015. + +use crate::alphabets::Alphabet; +use crate::data_structures::suffix_array::RawSuffixArraySlice; +use crate::utils::prescan; +use serde::{Deserialize, Serialize}; +use std::iter::repeat; + +pub type BWT = Vec; +pub type BWTSlice = [u8]; +pub type Less = Vec; +pub type BWTFind = Vec; + +/// Calculate Burrows-Wheeler-Transform of the given text of length n. +/// Complexity: O(n). +/// +/// # Arguments +/// +/// * `text` - the text ended by sentinel symbol (being lexicographically smallest) +/// * `pos` - the suffix array for the text +/// +/// # Example +/// +/// ``` +/// use bio_inline::data_structures::bwt::bwt; +/// use bio_inline::data_structures::suffix_array::suffix_array; +/// let text = b"GCCTTAACATTATTACGCCTA$"; +/// let pos = suffix_array(text); +/// let bwt = bwt(text, &pos); +/// assert_eq!(bwt, b"ATTATTCAGGACCC$CTTTCAA"); +/// ``` +pub fn bwt(text: &[u8], pos: RawSuffixArraySlice) -> BWT { + assert_eq!(text.len(), pos.len()); + let n = text.len(); + let mut bwt: BWT = repeat(0).take(n).collect(); + for r in 0..n { + let p = pos[r]; + bwt[r] = if p > 0 { text[p - 1] } else { text[n - 1] }; + } + + bwt +} + +/// Calculate the inverse of a BWT of length n, which is the original text. +/// Complexity: O(n). +/// +/// This only works if the last sentinel in the original text is unique +/// and lexicographically the smallest. 
+/// +/// # Arguments +/// +/// * `bwt` - the BWT +pub fn invert_bwt(bwt: &BWTSlice) -> Vec { + let alphabet = Alphabet::new(bwt); + let n = bwt.len(); + let bwtfind = bwtfind(bwt, &alphabet); + let mut inverse = Vec::with_capacity(n); + + let mut r = bwtfind[0]; + for _ in 0..n { + r = bwtfind[r]; + inverse.push(bwt[r]); + } + + inverse +} + +/// An occurrence array implementation. +#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct Occ { + occ: Vec>, + k: u32, +} + +impl Occ { + /// Calculate occ array with sampling from BWT of length n. + /// Time complexity: O(n). + /// Space complexity: O(n / k * A) with A being the alphabet size. + /// The specified alphabet must match the alphabet of the text and its BWT. + /// For large texts, it is advisable to transform + /// the text before calculating the BWT (see alphabets::rank_transform). + /// + /// # Arguments + /// + /// * `bwt` - the BWT + /// * `k` - the sampling rate: every k-th entry will be stored + pub fn new(bwt: &BWTSlice, k: u32, alphabet: &Alphabet) -> Self { + let n = bwt.len(); + let m = alphabet.max_symbol().expect("Expecting non-empty alphabet.") as usize + 1; + let mut alpha = alphabet.symbols.iter().collect::>(); + // include sentinel '$' + if (b'$' as usize) < m && !alphabet.is_word(b"$") { + alpha.push(b'$' as usize); + } + let mut occ: Vec> = vec![Vec::new(); m]; + let mut curr_occ = vec![0_usize; m]; + + // characters not in the alphabet won't take up much space + for &a in &alpha { + occ[a].reserve(n / k as usize); + } + + for (i, &c) in bwt.iter().enumerate() { + curr_occ[c as usize] += 1; + + if i % k as usize == 0 { + // only visit characters in the alphabet + for &a in &alpha { + occ[a].push(curr_occ[a]); + } + } + } + + Occ { occ, k } + } + + /// Get occurrence count of symbol a in BWT[..r+1]. + /// Complexity: O(k). 
+ pub fn get(&self, bwt: &BWTSlice, r: usize, a: u8) -> usize { + // NOTE: + // + // Retrieving byte match counts in this function is critical to the performance of FM Index. + // + // The below manual count code is roughly equivalent to: + // ``` + // let count = bwt[(i * self.k) + 1..r + 1] + // .iter() + // .filter(|&&c| c == a) + // .count(); + // self.occ[a as usize][i] + count + // ``` + // + // But there are a couple of reasons to do this manually: + // 1) As of 2016, versions of rustc/LLVM vectorize this manual loop more reliably + // than the iterator adapter version. + // 2) Manually accumulating the byte match count in a single chunk can allows + // us to use a `u32` for that count, which has faster arithmetic on common arches. + // This does necessitate storing `k` as a u32. + // + // See the conversation in these issues for some of the history here: + // + // https://github.com/rust-bio/rust-bio/pull/74 + // https://github.com/rust-bio/rust-bio/pull/76 + + // self.k is our sampling rate, so find the checkpoints either side of r. + let lo_checkpoint = r / self.k as usize; + // Get the occurences at the low checkpoint + let lo_occ = self.occ[a as usize][lo_checkpoint]; + + // If the sampling rate is infrequent it is worth checking if there is a closer + // hi checkpoint. + if self.k > 64 { + let hi_checkpoint = lo_checkpoint + 1; + if let Some(&hi_occ) = self.occ[a as usize].get(hi_checkpoint) { + // Its possible that there are no occurences between the low and high + // checkpoint in which case we bail early. + if lo_occ == hi_occ { + return lo_occ; + } + + // If r is closer to the high checkpoint, count backwards from there. + let hi_idx = hi_checkpoint * self.k as usize; + if (hi_idx - r) < (self.k as usize / 2) { + return hi_occ - bytecount::count(&bwt[r + 1..=hi_idx], a); + } + } + } + + // Otherwise the default case is to count from the low checkpoint. 
+ let lo_idx = lo_checkpoint * self.k as usize; + bytecount::count(&bwt[lo_idx + 1..=r], a) + lo_occ + } +} + +/// Calculate the less array for a given BWT. Complexity O(n). +pub fn less(bwt: &BWTSlice, alphabet: &Alphabet) -> Less { + let m = alphabet.max_symbol().expect("Expecting non-empty alphabet.") as usize + 2; + let mut less: Less = repeat(0).take(m).collect(); + for &c in bwt.iter() { + less[c as usize] += 1; + } + // calculate +-prescan + prescan(&mut less[..], 0, |a, b| a + b); + + less +} + +/// Calculate the bwtfind array needed for inverting the BWT. Complexity O(n). +pub fn bwtfind(bwt: &BWTSlice, alphabet: &Alphabet) -> BWTFind { + let n = bwt.len(); + let mut less = less(bwt, alphabet); + + let mut bwtfind: BWTFind = repeat(0).take(n).collect(); + for (r, &c) in bwt.iter().enumerate() { + bwtfind[less[c as usize]] = r; + less[c as usize] += 1; + } + + bwtfind +} + +#[cfg(test)] +mod tests { + use super::{bwt, bwtfind, invert_bwt, Occ}; + use crate::alphabets::Alphabet; + use crate::data_structures::suffix_array::suffix_array; + + #[test] + fn test_bwtfind() { + let text = b"cabca$"; + let alphabet = Alphabet::new(b"abc$"); + let pos = suffix_array(text); + let bwt = bwt(text, &pos); + let bwtfind = bwtfind(&bwt, &alphabet); + assert_eq!(bwtfind, vec![5, 0, 3, 4, 1, 2]); + } + + #[test] + fn test_invert_bwt() { + let text = b"cabca$"; + let pos = suffix_array(text); + let bwt = bwt(text, &pos); + let inverse = invert_bwt(&bwt); + assert_eq!(inverse, text); + } + + #[test] + fn test_occ() { + let bwt = vec![1_u8, 3_u8, 3_u8, 1_u8, 2_u8, 0_u8]; + let alphabet = Alphabet::new([0_u8, 1_u8, 2_u8, 3_u8]); + let occ = Occ::new(&bwt, 3, &alphabet); + assert_eq!(occ.occ, [[0, 0], [1, 2], [0, 0], [0, 2]]); + assert_eq!(occ.get(&bwt, 4, 2_u8), 1); + assert_eq!(occ.get(&bwt, 4, 3_u8), 2); + } +} diff --git a/packages_rs/3rdparty/bio_inline/src/data_structures/fmindex.rs b/packages_rs/3rdparty/bio_inline/src/data_structures/fmindex.rs new file mode 100644 index 
000000000..e5a486eff --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/data_structures/fmindex.rs @@ -0,0 +1,820 @@ +// Copyright 2014-2016 Johannes Köster, Taylor Cramer. +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! The [Full-text index in Minute space index (FM-index)](https://doi.org/10.1109/SFCS.2000.892127) and +//! the FMD-Index for finding suffix array intervals matching a given pattern in linear time. +//! +//! # Examples +//! +//! ## Generate +//! +//! ``` +//! use bio_inline::alphabets::dna; +//! use bio_inline::data_structures::bwt::{bwt, less, Occ}; +//! use bio_inline::data_structures::fmindex::{FMIndex, FMIndexable}; +//! use bio_inline::data_structures::suffix_array::suffix_array; +//! +//! let text = b"GCCTTAACATTATTACGCCTA$"; +//! let alphabet = dna::n_alphabet(); +//! let sa = suffix_array(text); +//! let bwt = bwt(text, &sa); +//! let less = less(&bwt, &alphabet); +//! let occ = Occ::new(&bwt, 3, &alphabet); +//! let fm = FMIndex::new(&bwt, &less, &occ); +//! ``` +//! +//! ## Enclose in struct +//! +//! `FMIndex` was designed to not forcibly own the BWT and auxiliary data structures. +//! It can take a reference (`&`), owned structs or any of the more complex pointer types. +//! +//! ``` +//! use bio_inline::alphabets::dna; +//! use bio_inline::data_structures::bwt::{bwt, less, Less, Occ, BWT}; +//! use bio_inline::data_structures::fmindex::{FMIndex, FMIndexable}; +//! use bio_inline::data_structures::suffix_array::suffix_array; +//! use bio_inline::utils::TextSlice; +//! +//! pub struct Example { +//! fmindex: FMIndex, +//! } +//! +//! impl Example { +//! pub fn new(text: TextSlice) -> Self { +//! let alphabet = dna::n_alphabet(); +//! let sa = suffix_array(text); +//! let bwt = bwt(text, &sa); +//! let less = less(&bwt, &alphabet); +//! let occ = Occ::new(&bwt, 3, &alphabet); +//! 
let fm = FMIndex::new(bwt, less, occ); +//! Example { fmindex: fm } +//! } +//! } +//! ``` + +use crate::alphabets::dna; +use crate::data_structures::bwt::{Less, Occ, BWT}; +use crate::data_structures::suffix_array::SuffixArray; +use serde::{Deserialize, Serialize}; +use std::borrow::Borrow; +use std::iter::DoubleEndedIterator; +use std::mem::swap; + +/// A suffix array interval. +#[derive(Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct Interval { + pub lower: usize, + pub upper: usize, +} + +impl Interval { + pub fn occ(&self, sa: &SA) -> Vec { + (self.lower..self.upper) + .map(|pos| sa.get(pos).expect("Interval out of range of suffix array")) + .collect() + } +} + +/// This enum represents the potential result states +/// from a backward_search in the fm index. The +/// potential variants of the enum are: +/// Complete(Interval) — the query matched completely. The interval is the +/// range of suffix array indices matching the query string. +/// Partial(Interval, usize) - some suffix of the query matched, but not the whole query. +/// The interval returned is the range of suffix array indices for the maximal +/// matching suffix, and the `usize` is the length of the maximal matching suffix. +/// Absent - no suffix of the pattern matched in the text. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub enum BackwardSearchResult { + Complete(Interval), + Partial(Interval, usize), + Absent, +} + +pub trait FMIndexable { + /// Get occurrence count of symbol a in BWT[..r+1].
+ fn occ(&self, r: usize, a: u8) -> usize; + /// Also known as + fn less(&self, a: u8) -> usize; + fn bwt(&self) -> &BWT; + + /// Perform backward search, yielding `BackwardSearchResult` enum that + /// contains the suffix array interval denoting exact occurrences of the given pattern + /// of length m in the text if it exists, or the suffix array interval denoting the + /// exact occurrences of a maximal matching suffix of the given pattern if it does + /// not exist. If none of the pattern can be matched, the `BackwardSearchResult` is + /// `Absent`. + /// Complexity: O(m). + /// + /// # Arguments + /// + /// * `pattern` - the pattern to search + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets::dna; + /// use bio_inline::data_structures::bwt::{bwt, less, Occ}; + /// use bio_inline::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable}; + /// use bio_inline::data_structures::suffix_array::suffix_array; + /// + /// let text = b"GCCTTAACATTATTACGCCTA$"; + /// let alphabet = dna::n_alphabet(); + /// let sa = suffix_array(text); + /// let bwt = bwt(text, &sa); + /// let less = less(&bwt, &alphabet); + /// let occ = Occ::new(&bwt, 3, &alphabet); + /// let fm = FMIndex::new(&bwt, &less, &occ); + /// + /// let pattern = b"TTA"; + /// let bsr = fm.backward_search(pattern.iter()); + /// + /// let positions = match bsr { + /// BackwardSearchResult::Complete(sai) => sai.occ(&sa), + /// BackwardSearchResult::Partial(sai, _l) => sai.occ(&sa), + /// BackwardSearchResult::Absent => Vec::::new(), + /// }; + /// + /// assert_eq!(positions, [3, 12, 9]); + /// ``` + fn backward_search<'b, P: Iterator + DoubleEndedIterator>(&self, pattern: P) -> BackwardSearchResult { + let (mut l, mut r) = (0, self.bwt().len() - 1); + // to keep track of the last "valid" search interval if + // there is any valid suffix match. 
+ let (mut pl, mut pr) = (l, r); + + // the length of the suffix we have been able to match + // successfully + let mut matched_len = 0; + // track if we exit early or not due to an empty + // search interval. + let mut complete_match = true; + + for &a in pattern.rev() { + let less = self.less(a); + pl = l; + pr = r; + l = less + if l > 0 { self.occ(l - 1, a) } else { 0 }; + r = less + self.occ(r, a) - 1; + + // The symbol was not found if we end up with an empty interval. + // Terminate the LF-mapping process. In this case, also mark that + // we do not have a complete match. + if l > r { + complete_match = false; + break; + } + matched_len += 1; + } + + // if we matched at least 1 character + if matched_len > 0 { + // if we matched the full pattern length we + // have a complete match + if complete_match { + BackwardSearchResult::Complete(Interval { lower: l, upper: r + 1 }) + } else { + // if we matched less than the full pattern length, we have + // a partial suffix match + BackwardSearchResult::Partial( + Interval { + lower: pl, + upper: pr + 1, + }, + matched_len, + ) + } + } else { + // if we matched nothing we have an absent result + BackwardSearchResult::Absent + } + } +} + +/// The Fast Index in Minute space (FM-Index, Ferragina and Manzini, 2000) for finding suffix array +/// intervals matching a given pattern. +#[derive(Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct FMIndex, DLess: Borrow, DOcc: Borrow> { + bwt: DBWT, + less: DLess, + occ: DOcc, +} + +impl, DLess: Borrow, DOcc: Borrow> FMIndexable for FMIndex { + fn occ(&self, r: usize, a: u8) -> usize { + self.occ.borrow().get(self.bwt.borrow(), r, a) + } + fn less(&self, a: u8) -> usize { + self.less.borrow()[a as usize] + } + /// Provide a reference to the underlying BWT. + fn bwt(&self) -> &BWT { + self.bwt.borrow() + } +} + +impl, DLess: Borrow, DOcc: Borrow> FMIndex { + /// Construct a new instance of the FM index. 
+ /// + /// # Arguments + /// + /// * `bwt` - the BWT + /// * `less` - the less array of the BWT + /// * `occ` - the occurence array of the BWT + pub fn new(bwt: DBWT, less: DLess, occ: DOcc) -> Self { + FMIndex { bwt, less, occ } + } +} + +/// A bi-interval on suffix array of the forward and reverse strand of a DNA text. +#[derive(Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct BiInterval { + lower: usize, + lower_rev: usize, + size: usize, + match_size: usize, +} + +impl BiInterval { + pub fn forward(&self) -> Interval { + Interval { + upper: self.lower + self.size, + lower: self.lower, + } + } + pub fn revcomp(&self) -> Interval { + Interval { + upper: self.lower_rev + self.size, + lower: self.lower_rev, + } + } + + fn swapped(&self) -> BiInterval { + BiInterval { + lower: self.lower_rev, + lower_rev: self.lower, + size: self.size, + match_size: self.match_size, + } + } +} + +/// The FMD-Index for linear time search of supermaximal exact matches on forward and reverse +/// strand of DNA texts (Li, 2012). +#[derive(Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct FMDIndex, DLess: Borrow, DOcc: Borrow> { + fmindex: FMIndex, +} + +impl, DLess: Borrow, DOcc: Borrow> FMIndexable for FMDIndex { + fn occ(&self, r: usize, a: u8) -> usize { + self.fmindex.occ(r, a) + } + + fn less(&self, a: u8) -> usize { + self.fmindex.less(a) + } + + /// Provide a reference to the underlying BWT. + fn bwt(&self) -> &BWT { + self.fmindex.bwt() + } +} + +impl, DLess: Borrow, DOcc: Borrow> From> + for FMDIndex +{ + /// Construct a new instance of the FMD index (see Heng Li (2012) Bioinformatics). + /// This expects a BWT that was created from a text over the DNA alphabet with N + /// (`alphabets::dna::n_alphabet()`) consisting of the + /// concatenation with its reverse complement, separated by the sentinel symbol `$`. 
+ /// I.e., let T be the original text and R be its reverse complement. + /// Then, the expected text is T$R$. Further, multiple concatenated texts are allowed, e.g. + /// T1$R1$T2$R2$T3$R3$. + fn from(fmindex: FMIndex) -> FMDIndex { + let mut alphabet = dna::n_alphabet(); + alphabet.insert(b'$'); + assert!( + alphabet.is_word(fmindex.bwt()), + "Expecting BWT over the DNA alphabet (including N) with the sentinel $." + ); + + FMDIndex { fmindex } + } +} + +impl, DLess: Borrow, DOcc: Borrow> FMDIndex { + /// Find supermaximal exact matches (of length >= l) of given pattern that overlap position i in the pattern. + /// Complexity O(m) with pattern of length m. + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets::dna; + /// use bio_inline::data_structures::bwt::{bwt, less, Occ}; + /// use bio_inline::data_structures::fmindex::{FMDIndex, FMIndex}; + /// use bio_inline::data_structures::suffix_array::suffix_array; + /// + /// let text = b"ATTC$GAAT$"; + /// let alphabet = dna::n_alphabet(); + /// let sa = suffix_array(text); + /// let bwt = bwt(text, &sa); + /// let less = less(&bwt, &alphabet); + /// let occ = Occ::new(&bwt, 3, &alphabet); + /// let fm = FMIndex::new(&bwt, &less, &occ); + /// let fmdindex = FMDIndex::from(fm); + /// + /// let pattern = b"ATT"; + /// let intervals = fmdindex.smems(pattern, 2, 0); + /// + /// let forward_positions = intervals[0].0.forward().occ(&sa); + /// let revcomp_positions = intervals[0].0.revcomp().occ(&sa); + /// let pattern_position = intervals[0].1; + /// let smem_len = intervals[0].2; + /// + /// assert_eq!(forward_positions, [0]); + /// assert_eq!(revcomp_positions, [6]); + /// assert_eq!(pattern_position, 0); + /// assert_eq!(smem_len, 3); + /// ``` + pub fn smems(&self, pattern: &[u8], i: usize, l: usize) -> Vec<(BiInterval, usize, usize)> { + let curr = &mut Vec::new(); // pairs (biinterval, current match length) + let prev = &mut Vec::new(); // """ + let mut matches = Vec::new(); // triples (biinterval, 
position on pattern, smem length) + + let mut match_len = 0; + let mut interval = self.init_interval_with(pattern[i]); + if interval.size != 0 { + match_len += 1; + } + + for &a in pattern[i + 1..].iter() { + // forward extend interval + let forward_interval = self.forward_ext(&interval, a); + + // if size changed, add last interval to list + if interval.size != forward_interval.size { + curr.push((interval, match_len)); + } + // if new interval size is zero, stop, as no further forward extension is possible + if forward_interval.size == 0 { + break; + } + interval = forward_interval; + match_len += 1; + } + // add the last non-zero interval + curr.push((interval, match_len)); + // reverse intervals such that longest comes first + curr.reverse(); + + swap(curr, prev); + let mut j = pattern.len() as isize; + + for k in (-1..i as isize).rev() { + let a = if k == -1 { b'$' } else { pattern[k as usize] }; + curr.clear(); + // size of the last confirmed interval + let mut last_size = -1; + + for (interval, match_len) in prev.iter() { + // backward extend interval + let forward_interval = self.backward_ext(interval, a); + + if (forward_interval.size == 0 || k == -1) && + // interval could not be extended further + // if no interval has been extended this iteration, + // interval is maximal and can be added to the matches + curr.is_empty() && k < j && + match_len >= &l + { + j = k; + matches.push((*interval, (k + 1) as usize, *match_len)); + } + // add _interval to curr (will be further extended next iteration) + if forward_interval.size != 0 && forward_interval.size as isize != last_size { + last_size = forward_interval.size as isize; + curr.push((forward_interval, match_len + 1)); + } + } + if curr.is_empty() { + break; + } + swap(curr, prev); + } + + matches + } + + /// Find all supermaximal exact matches (of length >= l) of given pattern. + /// Complexity O(m^2) with pattern of length m. 
+ /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets::dna; + /// use bio_inline::data_structures::bwt::{bwt, less, Occ}; + /// use bio_inline::data_structures::fmindex::{FMDIndex, FMIndex}; + /// use bio_inline::data_structures::suffix_array::suffix_array; + /// + /// let text = b"ATTCGGGG$CCCCGAAT$"; + /// let alphabet = dna::n_alphabet(); + /// let sa = suffix_array(text); + /// let bwt = bwt(text, &sa); + /// let less = less(&bwt, &alphabet); + /// let occ = Occ::new(&bwt, 3, &alphabet); + /// let fm = FMIndex::new(&bwt, &less, &occ); + /// let fmdindex = FMDIndex::from(fm); + /// + /// let pattern = b"ATTGGGG"; + /// let intervals = fmdindex.all_smems(pattern, 0); + /// assert_eq!(intervals.len(), 2); + /// + /// let solutions = vec![[0, 14, 0, 3], [4, 9, 3, 4]]; + /// for (i, interval) in intervals.iter().enumerate() { + /// let forward_positions = interval.0.forward().occ(&sa); + /// let revcomp_positions = interval.0.revcomp().occ(&sa); + /// let pattern_position = interval.1; + /// let smem_len = interval.2; + /// assert_eq!( + /// [ + /// forward_positions[0], + /// revcomp_positions[0], + /// pattern_position, + /// smem_len + /// ], + /// solutions[i] + /// ); + /// } + /// ``` + pub fn all_smems(&self, pattern: &[u8], l: usize) -> Vec<(BiInterval, usize, usize)> { + let mut smems = Vec::new(); + let mut i0 = 0; + while i0 < pattern.len() { + let mut curr_smems = self.smems(pattern, i0, l); + let mut next_i0 = i0 + 1; // this always works since: + // if we have a SMEM overlapping i0, it is at least 1bp long. + // If we don't have a smem, then we'll reiterate from i0+1 + for (_, p, l) in &curr_smems { + if p + l > next_i0 { + next_i0 = p + l; + } + } + i0 = next_i0; + smems.append(&mut curr_smems); + } + smems + } + + /// Initialize interval with given start character. 
+ pub fn init_interval_with(&self, a: u8) -> BiInterval { + let comp_a = dna::complement(a); + let lower = self.fmindex.less(a); + + BiInterval { + lower, + lower_rev: self.fmindex.less(comp_a), + size: self.fmindex.less(a + 1) - lower, + match_size: 1, + } + } + + /// Initialize interval for empty pattern. The interval points at the whole suffix array. + pub fn init_interval(&self) -> BiInterval { + BiInterval { + lower: 0, + lower_rev: 0, + size: self.fmindex.bwt.borrow().len(), + match_size: 0, + } + } + + /// Backward extension of given interval with given character. + pub fn backward_ext(&self, interval: &BiInterval, a: u8) -> BiInterval { + let mut s = 0; + let mut o = 0; + let mut l = interval.lower_rev; + // Interval [l(c(aP)), u(c(aP))] is a subinterval of [l(c(P)), u(c(P))] for each a, + // starting with the lexicographically smallest ($), + // then c(T) = A, c(G) = C, c(C) = G, N, c(A) = T, ... + // Hence, we calculate lower revcomp bounds by iterating over + // symbols and updating from previous one. + for &b in b"$TGCNAtgcna".iter() { + l += s; + o = if interval.lower == 0 { + 0 + } else { + self.fmindex.occ(interval.lower - 1, b) + }; + // calculate size + s = self.fmindex.occ(interval.lower + interval.size - 1, b) - o; + if b == a { + break; + } + } + // calculate lower bound + let k = self.fmindex.less(a) + o; + + BiInterval { + lower: k, + lower_rev: l, + size: s, + match_size: interval.match_size + 1, + } + } + + pub fn forward_ext(&self, interval: &BiInterval, a: u8) -> BiInterval { + let comp_a = dna::complement(a); + + self.backward_ext(&interval.swapped(), comp_a).swapped() + } + + /// Construct a new instance of the FMD index (see Heng Li (2012) Bioinformatics) + /// without checking whether the text is over the DNA alphabet with N. 
+ /// This expects a BWT that was created from a text over the DNA alphabet with N + /// (`alphabets::dna::n_alphabet()`) consisting of the + /// concatenation with its reverse complement, separated by the sentinel symbol `$`. + /// I.e., let T be the original text and R be its reverse complement. + /// Then, the expected text is T$R$. Further, multiple concatenated texts are allowed, e.g. + /// T1$R1$T2$R2$T3$R3$. + /// It is unsafe to construct an FMD index from an FM index that is not built on the DNA alphabet. + pub unsafe fn from_fmindex_unchecked(fmindex: FMIndex) -> FMDIndex { + FMDIndex { fmindex } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::alphabets::dna; + use crate::data_structures::bwt::{bwt, less, Occ}; + use crate::data_structures::suffix_array::suffix_array; + + #[test] + fn test_fmindex() { + let text = b"GCCTTAACATTATTACGCCTA$"; + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + let fm = FMIndex::new(&bwt, &less, &occ); + + let pattern = b"TTA"; + let sai = fm.backward_search(pattern.iter()); + + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa), + BackwardSearchResult::Absent => Vec::::new(), + }; + + assert_eq!(positions, [3, 12, 9]); + } + + #[test] + fn test_fmindex_not_found() { + let text = b"TCCTTAACATTATTACTCCTA$"; + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + let fm = FMIndex::new(&bwt, &less, &occ); + + let pattern = b"TTG"; + let sai = fm.backward_search(pattern.iter()); + + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa), + BackwardSearchResult::Absent => Vec::::new(), + 
}; + + assert_eq!(positions, []); + } + + #[test] + fn test_fmindex_backward_search_optimization() { + let text = b"GATTACA$"; + let pattern = &text[..text.len() - 1]; + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + let fm = FMIndex::new(&bwt, &less, &occ); + + let sai = fm.backward_search(pattern.iter()); + + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, _l) => saint.occ(&sa), + BackwardSearchResult::Absent => Vec::::new(), + }; + + assert_eq!(positions, [0]); + } + + #[test] + fn test_fmindex_backward_search_partial_match() { + let text = b"GATTACA$"; + let pattern = b"GTACA"; + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + let fm = FMIndex::new(&bwt, &less, &occ); + + let sai = fm.backward_search(pattern.iter()); + + let mut partial_match_len = 0; + let positions = match sai { + BackwardSearchResult::Complete(saint) => saint.occ(&sa), + BackwardSearchResult::Partial(saint, l) => { + partial_match_len = l; + saint.occ(&sa) + } + BackwardSearchResult::Absent => Vec::::new(), + }; + + assert_eq!(partial_match_len, 4); + assert_eq!(positions, [3]); + } + + #[test] + fn test_smems() { + let orig_text = b"GCCTTAACAT"; + let revcomp_text = dna::revcomp(orig_text); + let text_builder: Vec<&[u8]> = vec![orig_text, b"$", &revcomp_text[..], b"$"]; + let text = text_builder.concat(); + + let alphabet = dna::n_alphabet(); + let sa = suffix_array(&text); + let bwt = bwt(&text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + + let fmindex = FMIndex::new(&bwt, &less, &occ); + let fmdindex = FMDIndex::from(fmindex); + { + let pattern = b"AA"; + let intervals = fmdindex.smems(pattern, 0, 0); + let forward = 
intervals[0].0.forward(); + let revcomp = intervals[0].0.revcomp(); + let pattern_position = intervals[0].1; + let smem_len = intervals[0].2; + assert_eq!(forward.occ(&sa), [5, 16]); + assert_eq!(revcomp.occ(&sa), [3, 14]); + assert_eq!(pattern_position, 0); + assert_eq!(smem_len, 2); + } + { + let pattern = b"CTTAA"; + let intervals = fmdindex.smems(pattern, 1, 0); + assert_eq!(intervals[0].0.forward().occ(&sa), [2]); + assert_eq!(intervals[0].0.revcomp().occ(&sa), [14]); + assert_eq!(intervals[0].1, 0); + assert_eq!(intervals[0].2, 5); + assert_eq!(intervals[0].0.match_size, 5); + } + { + let pattern = b"CTTAA"; + let intervals = fmdindex.smems(pattern, 1, 7); + assert!(intervals.is_empty()); + } + } + + #[test] + fn test_all_smems() { + let text = b"ATTCGGGG$CCCCGAAT$"; + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + let fm = FMIndex::new(&bwt, &less, &occ); + let fmdindex = FMDIndex::from(fm); + + { + let pattern = b"ATTGGGG"; + let intervals = fmdindex.all_smems(pattern, 0); + assert_eq!(intervals.len(), 2); + let solutions = vec![[0, 14, 0, 3], [4, 9, 3, 4]]; + for (i, interval) in intervals.iter().enumerate() { + let forward_positions = interval.0.forward().occ(&sa); + let revcomp_positions = interval.0.revcomp().occ(&sa); + let pattern_position = interval.1; + let smem_len = interval.2; + assert_eq!( + [forward_positions[0], revcomp_positions[0], pattern_position, smem_len], + solutions[i] + ); + } + } + } + + #[test] + fn test_init_interval() { + let text = b"ACGT$TGCA$"; + + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + + let fmindex = FMIndex::new(&bwt, &less, &occ); + let fmdindex = FMDIndex::from(fmindex); + let pattern = b"T"; + let interval = fmdindex.init_interval_with(pattern[0]); + + 
assert_eq!(interval.forward().occ(&sa), [3, 5]); + assert_eq!(interval.revcomp().occ(&sa), [8, 0]); + + let empty = fmdindex.init_interval(); + let extended = fmdindex.backward_ext(&empty, pattern[0]); + assert_eq!(extended, interval); + let extended = fmdindex.forward_ext(&empty, pattern[0]); + assert_eq!(extended, interval); + } + + #[test] + fn test_issue39() { + let reads = b"GGCGTGGTGGCTTATGCCTGTAATCCCAGCACTTTGGGAGGTCGAAGTGGGCGG$CCGC\ + CCACTTCGACCTCCCAAAGTGCTGGGATTACAGGCATAAGCCACCACGCC$CGAAGTGG\ + GCGGATCACTTGAGGTCAGGAGTTGGAGACTAGCCTGGCCAACACGATGAAACCCCGTC\ + TCTAATA$TATTAGAGACGGGGTTTCATCGTGTTGGCCAGGCTAGTCTCCAACTCCTGA\ + CCTCAAGTGATCCGCCCACTTCG$AGCTCGAAAAATGTTTGCTTATTTTGGTAAAATTA\ + TTCATTGACTATGCTCAGAAATCAAGCAAACTGTCCATATTTCATTTTTTG$CAAAAAA\ + TGAAATATGGACAGTTTGCTTGATTTCTGAGCATAGTCAATGAATAATTTTACCAAAAT\ + AAGCAAACATTTTTCGAGCT$AGCTCGAAAAATGTTTGCTTATTTTGGTAAAATTATTC\ + ATTGACTATGCTCAGAAATCAAGCAAACTGTCCATATTTCATTTTTTGAAATTACATAT\ + $ATATGTAATTTCAAAAAATGAAATATGGACAGTTTGCTTGATTTCTGAGCATAGTCAA\ + TGAATAATTTTACCAAAATAAGCAAACATTTTTCGAGCT$TAAAATTTCCTCTGACAGT\ + GTAAAAGAGATCTTCATACAAAAATCAGAATTTATATAGTCTCTTTCCAAAAGACCATA\ + AAACCAATCAGTTAATAGTTGAT$ATCAACTATTAACTGATTGGTTTTATGGTCTTTTG\ + GAAAGAGACTATATAAATTCTGATTTTTGTATGAAGATCTCTTTTACACTGTCAGAGGA\ + AATTTTA$CACCTATCTACCCTGAATCTAAGTGCTAACAGGAAAGGATGCCAGATTGCA\ + TGCCTGCTGATAAAGCCACAGTTTGGACTGTCACTCAATCACCATCGTTC$GAACGATG\ + GTGATTGAGTGACAGTCCAAACTGTGGCTTTATCAGCAGGCATGCAATCTGGCATCCTT\ + TCCTGTTAGCACTTAGATTCAGGGTAGATAGGTG$CATCGTTCCTCCTGTGACTCAGTA\ + TAACAAGATTGGGAGAATACTCTACAGTTCCTGATTCCCCCACAG$CTGTGGGGGAATC\ + AGGAACTGTAGAGTATTCTCCCAATCTTGTTATACTGAGTCACAGGAGGAACGATG$TG\ + TAAATTCTGAGAAAAATTTGCAGGTCTTTCTTCAGGAGCATGTAATCTCTTGCTCTCTT\ + TGTTATCTATCTATAGTACTGTAGGTTATCTGGAGTTGCT$AGCAACTCCAGATAACCT\ + ACAGTACTATAGATAGATAACAAAGAGAGCAAGAGATTACATGCTCCTGAAGAAAGACC\ + TGCAAATTTTTCTCAGAATTTACA$CACTTCTCCTTGTCTTTACAGACTGGTTTTGCAC\ + TGGGAAATCCTTTCACCAGTCAGCCCAGTTAGAGATTCTG$CAGAATCTCTAACTGGGC\ + 
TGACTGGTGAAAGGATTTCCCAGTGCAAAACCAGTCTGTAAAGACAAGGAGAAGTG$AA\ + TGGAGGTATATAAATTATCTGGCAAAGTGACATATCCTGACACATTCTCCAGGATAGAT\ + CAAATGTTAGGTCACAAAGAGAGTCTTAACAAAATT$AATTTTGTTAAGACTCTCTTTG\ + TGACCTAACATTTGATCTATCCTGGAGAATGTGTCAGGATATGTCACTTTGCCAGATAA\ + TTTATATACCTCCATT$TTAATTTTGTTAAGACTCTCTTTGTGACCTAACATTTGATCT\ + ATCCTGGAGAATGTGTCAGGATATGTCACTTTGCCAGATAATTTATATACCTCCATTTT\ + $AAAATGGAGGTATATAAATTATCTGGCAAAGTGACATATCCTGACACATTCTCCAGGA\ + TAGATCAAATGTTAGGTCACAAAGAGAGTCTTAACAAAATTAA$TTCTTCTTTGACTCA\ + TTGGTTGTTCAATAGTATGTTGTTTAATTTCCATATATTTGTAAATGTTTCCGTTTTCC\ + TTCTACTATTGAATTTTTGCTTCATC$GATGAAGCAAAAATTCAATAGTAGAAGGAAAA\ + CGGAAACATTTACAAATATATGGAAATTAAACAACATACTATTGAACAACCAATGAGTC\ + AAAGAAGAA$AGGAAAACGGAAACATTTACAAATATATGGAAATTAAACAACATACTAT\ + TGAACAACCAATGAGTCAAAGAAGAAATCAAAAAGAATATTAGAAAAC$GTTTTCTAAT\ + ATTCTTTTTGATTTCTTCTTTGACTCATTGGTTGTTCAATAGTATGTTGTTTAATTTCC\ + ATATATTTGTAAATGTTTCCGTTTTCCT$TTAGAAAACAAGCTGACAAAAAAATAAAAA\ + AACACAACATAGCAAAACTTAGAAATGCAGCAAAGGCAGTACTAAAGAGGGAAATTTAT\ + AGCAATAAATGC$GCATTTATTGCTATAAATTTCCCTCTTTAGTACTGCCTTTGCTGCA\ + TTTCTAAGTTTTGCTATGTTGTGTTTTTTTATTTTTTTGTCAGCTTGTTTTCTAA$TTT\ + ATTGCTATAAATTTCCCTCTTTAGTACTGCCTTTGCTGCATTTCTAAGTTTTGCTATGT\ + TGTGTTTTTTTATTTTTTTGTCAGCTTGTTTTCTA$TAGAAAACAAGCTGACAAAAAAA\ + TAAAAAAACACAACATAGCAAAACTTAGAAATGCAGCAAAGGCAGTACTAAAGAGGGAA\ + ATTTATAGCAATAAA$TCTTTCTTCTTTTTTAAGGTAGGCATTTATTGCTATAAATTTC\ + CCTCTTTAGTACTGCCTTTG$CAAAGGCAGTACTAAAGAGGGAAATTTATAGCAATAAA\ + TGCCTACCTTAAAAAAGAAGAAAGA$"; + + let alphabet = dna::n_alphabet(); + let sa = suffix_array(reads); + let bwt = bwt(reads, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + + let fmindex = FMIndex::new(&bwt, &less, &occ); + let fmdindex = FMDIndex::from(fmindex); + + let read = b"GGCGTGGTGGCTTATGCCTGTAATCCCAGCACTTTGGGAGGTCGAAGTGGGCGG"; + let read_pos = 0; + + for i in 0..read.len() { + println!("i {i}"); + let intervals = fmdindex.smems(read, i, 0); + println!("{intervals:?}"); + let matches = intervals + 
.iter() + .flat_map(|interval| interval.0.forward().occ(&sa)) + .collect::>(); + assert_eq!(matches, vec![read_pos]); + } + } +} diff --git a/packages_rs/3rdparty/bio_inline/src/data_structures/mod.rs b/packages_rs/3rdparty/bio_inline/src/data_structures/mod.rs new file mode 100644 index 000000000..14869ad1a --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/data_structures/mod.rs @@ -0,0 +1,11 @@ +// Copyright 2014-2016 Johannes Köster. +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! Various useful data structures. + +pub mod bwt; +pub mod fmindex; +pub mod smallints; +pub mod suffix_array; diff --git a/packages_rs/3rdparty/bio_inline/src/data_structures/smallints.rs b/packages_rs/3rdparty/bio_inline/src/data_structures/smallints.rs new file mode 100644 index 000000000..a185ac9a7 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/data_structures/smallints.rs @@ -0,0 +1,196 @@ +// Copyright 2014-2016 Johannes Köster. +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! A data structure for a sequence of small integers with a few big integers. +//! Small ints are stored in type S (e.g. a byte), big ints are stored separately (in type `B`) in a BTree. +//! The implementation provides vector-like operations on the data structure (e.g. retrieve a position, +//! add an integer, etc.). Getting and setting (by position) time complexity is `O(1)` for small ints, +//! and `O(b)` for big ints, where `b` is the number of big ints stored. +//! +//! # Space usage +//! SmallInts pay the cost of slower retrieval of big integers with smaller overall memory usage; +//! `O(size_of(S) * (s+b) + size_of(B) * b)` where `S` and `B` are the small and large int types, and +//! `s` and `b` are the number of those stored respectively. 
+//! +//! # Example +//! +//! ``` +//! use bio_inline::data_structures::smallints::SmallInts; +//! let mut smallints: SmallInts = SmallInts::new(); +//! smallints.push(3); +//! smallints.push(4); +//! smallints.push(255); +//! smallints.push(305093); +//! assert_eq!(smallints.get(0).unwrap(), 3); +//! smallints.set(0, 50000); +//! let values: Vec = smallints.iter().collect(); +//! assert_eq!(values, [50000, 4, 255, 305093]); +//! ``` + +use num_integer::Integer; +use num_traits::{cast, Bounded, Num, NumCast}; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::iter::{repeat, Enumerate}; +use std::mem::size_of; +use std::slice; + +/// Data structure for storing a sequence of small integers with few big ones space efficiently +/// while supporting classical vector operations. +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)] +pub struct SmallInts { + smallints: Vec, + bigints: BTreeMap, +} + +impl Default for SmallInts { + fn default() -> Self { + assert!(size_of::() < size_of::(), "S has to be smaller than B"); + SmallInts { + smallints: Vec::new(), + bigints: BTreeMap::new(), + } + } +} + +impl SmallInts { + /// Create a new instance. + pub fn new() -> Self { + Self::default() + } + + /// Create a new instance with a given capacity. + pub fn with_capacity(n: usize) -> Self { + assert!(size_of::() < size_of::(), "S has to be smaller than B"); + SmallInts { + smallints: Vec::with_capacity(n), + bigints: BTreeMap::new(), + } + } + + /// Create a new instance containing `n` times the integer `v` (and `v` is expected to be small). + pub fn from_elem(v: S, n: usize) -> Self { + assert!(size_of::() < size_of::(), "S has to be smaller than B"); + if v > cast(0).unwrap() { + assert!(v < S::max_value(), "v has to be smaller than maximum value"); + } + + SmallInts { + smallints: repeat(v).take(n).collect(), + bigints: BTreeMap::new(), + } + } + + /// Return the integer at position `i`.
Time complexity `O(1)` if `i` points to a small int, + /// `O(log(b))` for a big int, where `b` denotes the number of big ints stored. + pub fn get(&self, i: usize) -> Option { + if i < self.smallints.len() { + self.real_value(i, self.smallints[i]) + } else { + None + } + } + + /// Append `v` to the sequence. This will determine whether `v` is big or small and store it + /// accordingly. Time complexity `O(1)` for small ints and `O(log(b))` for big ints, + /// where `b` denotes the number of big ints stored. + pub fn push(&mut self, v: B) { + let maxv: S = S::max_value(); + match cast(v) { + Some(v) if v < maxv => self.smallints.push(v), + _ => { + let i = self.smallints.len(); + self.smallints.push(maxv); + self.bigints.insert(i, v); + } + } + } + + /// Set value of position `i` to `v`. This will determine whether `v` is big or small and store it accordingly. + /// Time complexity `O(1)` for small ints and `O(log(b))` for big ints, + /// where `b` denotes the number of big ints stored. + pub fn set(&mut self, i: usize, v: B) { + let maxv: S = S::max_value(); + match cast(v) { + Some(v) if v < maxv => self.smallints[i] = v, + _ => { + self.smallints[i] = maxv; + self.bigints.insert(i, v); + } + } + } + + /// Iterate over sequence. Values will be returned in the big integer type (`B`). + pub fn iter(&self) -> Iter<'_, S, B> { + Iter { + smallints: self, + items: self.smallints.iter().enumerate(), + } + } + + /// Decompress into a normal vector of big integers (type `B`). + pub fn decompress(&self) -> Vec { + self.iter().collect() + } + + /// Length of the sequence. + pub fn len(&self) -> usize { + self.smallints.len() + } + + /// is the sequence empty? + pub fn is_empty(&self) -> bool { + self.smallints.is_empty() + } + + fn real_value(&self, i: usize, v: S) -> Option { + if v < S::max_value() { + cast(v) + } else { + self.bigints.get(&i).copied() + } + } +} + +/// Iterator over the elements of a `SmallInts` sequence. 
+#[derive(Clone, Debug)] +pub struct Iter<'a, S, B> +where + S: Integer + Bounded + NumCast + Copy, + B: Integer + NumCast + Copy, + ::FromStrRadixErr: 'a, + ::FromStrRadixErr: 'a, +{ + smallints: &'a SmallInts, + items: Enumerate>, +} + +impl<'a, S, B> Iterator for Iter<'a, S, B> +where + S: 'a + Integer + Bounded + NumCast + Copy, + B: 'a + Integer + NumCast + Copy, + ::FromStrRadixErr: 'a, + ::FromStrRadixErr: 'a, +{ + type Item = B; + + fn next(&mut self) -> Option { + match self.items.next() { + Some((i, &v)) => self.smallints.real_value(i, v), + None => None, + } + } +} + +#[cfg(test)] +mod tests { + #[test] + fn test_serde() { + use serde::{Deserialize, Serialize}; + fn impls_serde_traits() {} + + impls_serde_traits::>(); + } +} diff --git a/packages_rs/3rdparty/bio_inline/src/data_structures/suffix_array.rs b/packages_rs/3rdparty/bio_inline/src/data_structures/suffix_array.rs new file mode 100644 index 000000000..8e133a283 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/data_structures/suffix_array.rs @@ -0,0 +1,919 @@ +// Copyright 2014-2016 Johannes Köster, Taylor Cramer. +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! Suffix arrays and related algorithms. +//! The implementation is based on the lecture notes +//! "Algorithmen auf Sequenzen", Kopczynski, Marschall, Martin and Rahmann, 2008 - 2015. +//! The original algorithm description can be found in: +//! [Ge Nong, Sen Zhang, Wai Hong Chan: Two Efficient Algorithms for Linear Time Suffix Array Construction. IEEE Trans. Computers 60(10): 1471–1484 (2011)](https://doi.org/10.1109/TC.2010.188) +//! +//! # Examples +//! +//! ``` +//! use bio_inline::data_structures::suffix_array::suffix_array; +//! let text = b"GCCTTAACATTATTACGCCTA$"; +//! let pos = suffix_array(text); +//! assert_eq!( +//! pos, +//!
vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9] +//! ); +//! ``` + +use crate::alphabets::{Alphabet, RankTransform}; +use crate::data_structures::bwt::{Less, Occ, BWT}; +use crate::data_structures::smallints::SmallInts; +use bv::{BitVec, Bits, BitsMut}; +use fxhash::FxHasher; +use num_integer::Integer; +use num_traits::{cast, NumCast, Unsigned}; +use serde::{Deserialize, Serialize}; +use std::borrow::Borrow; +use std::cmp; +use std::collections::HashMap; +use std::fmt::Debug; +use std::hash::BuildHasherDefault; +use std::iter; +use std::ops::Deref; +use vec_map::VecMap; + +pub type LCPArray = SmallInts; +pub type RawSuffixArray = Vec; +pub type RawSuffixArraySlice<'a> = &'a [usize]; + +type HashMapFx = HashMap>; + +/// A trait exposing general functionality of suffix arrays. +pub trait SuffixArray { + fn get(&self, index: usize) -> Option; + fn len(&self) -> usize; + fn is_empty(&self) -> bool; + + /// Sample the suffix array with the given sample rate. 
+ /// + /// # Arguments + /// + /// * `text` - text that the suffix array is built on + /// * `bwt` - the corresponding BWT + /// * `less` - the corresponding less array + /// * `occ` - the corresponding occ table + /// * `sampling_rate` - if sampling rate is k, every k-th entry will be kept + /// + /// # Example + /// + /// ``` + /// use bio_inline::alphabets::dna; + /// use bio_inline::data_structures::bwt::{bwt, less, Occ}; + /// use bio_inline::data_structures::suffix_array::{suffix_array, SuffixArray}; + /// + /// let text = b"ACGCGAT$"; + /// let alphabet = dna::n_alphabet(); + /// let sa = suffix_array(text); + /// let bwt = bwt(text, &sa); + /// let less = less(&bwt, &alphabet); + /// let occ = Occ::new(&bwt, 3, &alphabet); + /// let sampled = sa.sample(text, &bwt, &less, &occ, 2); + /// + /// for i in 0..sa.len() { + /// assert_eq!(sa.get(i), sampled.get(i)); + /// } + /// ``` + fn sample, DLess: Borrow, DOcc: Borrow>( + &self, + text: &[u8], + bwt: DBWT, + less: DLess, + occ: DOcc, + sampling_rate: usize, + ) -> SampledSuffixArray { + let mut sample = Vec::with_capacity((self.len() as f32 / sampling_rate as f32).ceil() as usize); + let mut extra_rows = HashMapFx::default(); + let sentinel = sentinel(text); + + for i in 0..self.len() { + let idx = self.get(i).unwrap(); + if (i % sampling_rate) == 0 { + sample.push(idx); + } else if bwt.borrow()[i] == sentinel { + // If bwt lookup will return a sentinel + // Text suffixes that begin right after a sentinel are always saved as extra rows + // to help deal with FM index last to front inaccuracy when there are many sentinels + extra_rows.insert(i, idx); + } + } + + SampledSuffixArray { + bwt, + less, + occ, + sample, + s: sampling_rate, + extra_rows, + sentinel, + } + } +} + +/// A sampled suffix array. 
+#[derive(Default, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)] +pub struct SampledSuffixArray, DLess: Borrow, DOcc: Borrow> { + bwt: DBWT, + less: DLess, + occ: DOcc, + sample: Vec, + s: usize, // Rate of sampling + extra_rows: HashMapFx, + sentinel: u8, +} + +impl SuffixArray for RawSuffixArray { + fn get(&self, index: usize) -> Option { + // Explicitly written out because Vec::get(index) generates a recursion warning + if index < self.len() { + Some(self[index]) + } else { + None + } + } + + fn len(&self) -> usize { + Vec::len(self) + } + + fn is_empty(&self) -> bool { + Vec::is_empty(self) + } +} + +impl, DLess: Borrow, DOcc: Borrow> SuffixArray for SampledSuffixArray { + fn get(&self, index: usize) -> Option { + if index < self.len() { + let mut pos = index; + let mut offset = 0; + loop { + if pos % self.s == 0 { + return Some(self.sample[pos / self.s] + offset); + } + + let c = self.bwt.borrow()[pos]; + + if c == self.sentinel { + // Check if next character in the bwt is the sentinel + // If so, there must be a cached result to workaround FM index last to front + // mapping inaccuracy when there are multiple sentinels + // This branch should rarely be triggered so the performance impact + // of hashmap lookups would be low + return Some(self.extra_rows[&pos] + offset); + } + + pos = self.less.borrow()[c as usize] + self.occ.borrow().get(self.bwt.borrow(), pos - 1, c); + offset += 1; + } + } else { + None + } + } + + fn len(&self) -> usize { + self.bwt.borrow().len() + } + + fn is_empty(&self) -> bool { + self.bwt.borrow().is_empty() + } +} + +impl, DLess: Borrow, DOcc: Borrow> SampledSuffixArray { + /// Get the sampling rate of the suffix array. + pub fn sampling_rate(&self) -> usize { + self.s + } + + /// Get a reference to the internal BWT. + pub fn bwt(&self) -> &BWT { + self.bwt.borrow() + } + + /// Get a reference to the internal Less. + pub fn less(&self) -> &Less { + self.less.borrow() + } + + /// Get a reference to the internal Occ. 
+ pub fn occ(&self) -> &Occ { + self.occ.borrow() + } +} + +/// Construct suffix array for given text of length n. +/// Complexity: O(n). +/// This is an implementation of the induced sorting as presented by +/// Ge Nong, Sen Zhang and Wai Hong Chan (2009), also known as SAIS. +/// The implementation is based on the following lecture notes: +/// http://ls11-www.cs.tu-dortmund.de/people/rahmann/algoseq.pdf +/// +/// The idea is to first mark positions as L or S, with L being a position +/// the suffix of which is lexicographically larger than that of the next position. +/// Then, LMS-positions (leftmost S) are S-positions right to an L-position. +/// An LMS substring is the substring from one LMS position to the next (inclusive). +/// The algorithm works as follows: +/// +/// 1. Sort LMS positions: first step 2 is applied to the unsorted sequence +/// of positions. Surprisingly, this sorts the LMS substrings. If all substrings +/// are different, LMS positions (and their suffixes) are sorted. Else, a reduced +/// text is built (at most half the size of the original text) and we recurse into +/// suffix array construction on the reduced text, yielding the sorted LMS positions. +/// 2. Derive the order of the other positions/suffixes from the (sorted) LMS positions. +/// For this, the (still empty) suffix array is partitioned into buckets. +/// Each bucket denotes an interval of suffixes with the same first symbol. +/// We know that the L-suffixes have to occur first in the buckets, because they +/// have to be lexicographically smaller than the S-suffixes with the same first letter. +/// The LMS-positions can now be used to insert the L-positions in the correct order +/// into the buckets. +/// Then, the S-positions can be inserted, again using the already existing entries +/// in the array. +/// +/// # Arguments +/// +/// * `text` - the text, ended by sentinel symbol (being lexicographically smallest).
The text may +/// also contain multiple sentinel symbols, used to concatenate multiple sequences without mixing +/// their suffixes together. +/// +/// # Example +/// +/// ``` +/// use bio_inline::data_structures::suffix_array::suffix_array; +/// let text = b"GCCTTAACATTATTACGCCTA$"; +/// let pos = suffix_array(text); +/// assert_eq!( +/// pos, +/// vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9] +/// ); +/// ``` +pub fn suffix_array(text: &[u8]) -> RawSuffixArray { + let n = text.len(); + let alphabet = Alphabet::new(text); + let sentinel_count = sentinel_count(text); + let mut sais = Sais::new(n); + + match alphabet.len() + sentinel_count { + a if u8::try_from(a).is_ok() => sais.construct(&transform_text::(text, &alphabet, sentinel_count)), + a if u16::try_from(a).is_ok() => sais.construct(&transform_text::(text, &alphabet, sentinel_count)), + a if u32::try_from(a).is_ok() => sais.construct(&transform_text::(text, &alphabet, sentinel_count)), + _ => sais.construct(&transform_text::(text, &alphabet, sentinel_count)), + } + + sais.pos +} + +/// Construct lcp array for given text and suffix array of length n. +/// Complexity: O(n). +/// +/// # Arguments +/// +/// * `text` - the text ended by sentinel symbol (being lexicographically smallest) +/// * `pos` - the suffix array for the text +/// +/// # Example +/// +/// ``` +/// use bio_inline::data_structures::suffix_array::{lcp, suffix_array}; +/// let text = b"GCCTTAACATTATTACGCCTA$"; +/// let pos = suffix_array(text); +/// +/// // obtain compressed LCP array +/// let lcp = lcp(text, &pos); +/// +/// // get most values in O(1). +/// assert_eq!(lcp.get(6).unwrap(), 4); +/// +/// // obtain uncompressed LCP array. 
+/// let uncompressed = lcp.decompress(); +/// assert_eq!( +/// uncompressed, +/// [-1, 0, 1, 1, 2, 1, 4, 0, 1, 3, 1, 1, 2, 0, 4, 0, 2, 2, 2, 1, 3, 3, -1] +/// ) +/// ``` +pub fn lcp>(text: &[u8], pos: SA) -> LCPArray { + assert_eq!(text.len(), pos.len()); + let n = text.len(); + + // provide the lexicographical rank for each suffix + let mut rank: Vec = iter::repeat(0).take(n).collect(); + for (r, p) in pos.iter().enumerate() { + rank[*p] = r; + } + + let mut lcp = SmallInts::from_elem(-1, n + 1); + let mut l = 0_usize; + for (p, &r) in rank.iter().enumerate().take(n - 1) { + // since the sentinel has rank 0 and is excluded above, + // we will never have a negative index below + let pred = pos[r - 1]; + while pred + l < n && p + l < n && text[p + l] == text[pred + l] { + l += 1; + } + lcp.set(r, l as isize); + l = if l > 0 { l - 1 } else { 0 }; + } + + lcp +} + +/// Calculate all locally shortest unique substrings from a given suffix and lcp array +/// (Ohlebusch (2013). "Bioinformatics Algorithms". ISBN 978-3-00-041316-2). +/// Complexity: O(n) +/// +/// # Arguments +/// +/// * `pos` - the suffix array +/// * `lcp` - the lcp array +/// +/// # Returns +/// +/// A vector of the length of the shortest unique substring for each position of the text. +/// Suffixes are excluded. If no unique substring starts at a given position, the entry is `None`.
+/// +/// # Example +/// +/// ``` +/// use bio_inline::data_structures::suffix_array::{lcp, shortest_unique_substrings, suffix_array}; +/// let text = b"GCTGCTA$"; +/// let pos = suffix_array(text); +/// +/// // obtain compressed LCP array +/// let lcp = lcp(text, &pos); +/// +/// // calculate shortest unique substrings +/// let sus = shortest_unique_substrings(&pos, &lcp); +/// assert_eq!( +/// sus, +/// [ +/// Some(4), +/// Some(3), +/// Some(2), +/// Some(4), +/// Some(3), +/// Some(2), +/// Some(1), +/// Some(1) +/// ] +/// ); +/// ``` +pub fn shortest_unique_substrings(pos: &SA, lcp: &LCPArray) -> Vec> { + let n = pos.len(); + // Initialize array representing the length of the shortest unique substring starting at position i + let mut sus = vec![None; n]; + for i in 0..n { + // The longest common prefixes (LCP) of suffix pos[i] with its predecessor and successor are not unique. + // In turn, their maximum + 1 is the length of the shortest unique substring starting at pos[i]. + let len = 1 + cmp::max(lcp.get(i).unwrap(), lcp.get(i + 1).unwrap_or(0)) as usize; + let p = pos.get(i).unwrap(); + // Check if the suffix pos[i] is a prefix of pos[i+1]. In that case, there is no unique substring + // at this position. + if n - p >= len { + sus[p] = Some(len); + } + } + sus +} + +/// Return last character of the text (expected to be the sentinel). +fn sentinel(text: &[u8]) -> u8 { + text[text.len() - 1] +} + +/// Count the sentinels occurring in the text given that the last character is the sentinel. +fn sentinel_count(text: &[u8]) -> usize { + let sentinel = sentinel(text); + assert!( + text.iter().all(|&a| a >= sentinel), + "Expecting extra sentinel symbol being lexicographically smallest at the end of the \ + text." + ); + + text.iter().fold(0, |count, &a| count + (a == sentinel) as usize) +} + +/// Transform the given text into integers for usage in `SAIS`.
+fn transform_text( + text: &[u8], + alphabet: &Alphabet, + sentinel_count: usize, +) -> Vec { + let sentinel = sentinel(text); + let transform = RankTransform::new(alphabet); + let offset = sentinel_count - 1; + + let mut transformed: Vec = Vec::with_capacity(text.len()); + let mut s = sentinel_count; + for &a in text.iter() { + if a == sentinel { + s -= 1; + transformed.push(cast(s).unwrap()); + } else { + transformed.push(cast(*(transform.ranks.get(a as usize)).unwrap() as usize + offset).unwrap()); + } + } + + transformed +} + +/// SAIS implementation (see function `suffix_array` for description). +struct Sais { + pos: Vec, + lms_pos: Vec, + reduced_text_pos: Vec, + bucket_sizes: VecMap, + bucket_start: Vec, + bucket_end: Vec, +} + +impl Sais { + /// Create a new instance. + fn new(n: usize) -> Self { + Sais { + pos: Vec::with_capacity(n), + lms_pos: Vec::with_capacity(n), + reduced_text_pos: vec![0; n], + bucket_sizes: VecMap::new(), + bucket_start: Vec::with_capacity(n), + bucket_end: Vec::with_capacity(n), + } + } + + /// Init buckets. + fn init_bucket_start(&mut self, text: &[T]) { + self.bucket_sizes.clear(); + self.bucket_start.clear(); + + for &c in text.iter() { + if !self.bucket_sizes.contains_key(cast(c).unwrap()) { + self.bucket_sizes.insert(cast(c).unwrap(), 0); + } + *(self.bucket_sizes.get_mut(cast(c).unwrap()).unwrap()) += 1; + } + + let mut sum = 0; + for &size in self.bucket_sizes.values() { + self.bucket_start.push(sum); + sum += size; + } + } + + /// Initialize pointers to the last element of the buckets. + fn init_bucket_end(&mut self, text: &[T]) { + self.bucket_end.clear(); + for &r in self.bucket_start[1..].iter() { + self.bucket_end.push(r - 1); + } + self.bucket_end.push(text.len() - 1); + } + + /// Check if two LMS substrings are equal. + fn lms_substring_eq( + &self, + text: &[T], + pos_types: &PosTypes, + i: usize, + j: usize, + ) -> bool { + for k in 0.. 
{ + let lmsi = pos_types.is_lms_pos(i + k); + let lmsj = pos_types.is_lms_pos(j + k); + if text[i + k] != text[j + k] { + // different symbols + return false; + } + if lmsi != lmsj { + // different length + return false; + } + if k > 0 && lmsi && lmsj { + // same symbols and same length + return true; + } + } + false + } + + /// Sort LMS suffixes. + fn sort_lms_suffixes< + T: Integer + Unsigned + NumCast + Copy + Debug, + S: Integer + Unsigned + NumCast + Copy + Debug, + >( + &mut self, + text: &[T], + pos_types: &PosTypes, + lms_substring_count: usize, + ) { + // if less than 2 LMS substrings are present, no further sorting is needed + if lms_substring_count > 1 { + // sort LMS suffixes by recursively building SA on reduced text + let mut reduced_text: Vec = vec![cast(0).unwrap(); lms_substring_count]; + let mut label = 0; + reduced_text[self.reduced_text_pos[self.pos[0]]] = cast(label).unwrap(); + let mut prev = None; + for &p in &self.pos { + if pos_types.is_lms_pos(p) { + // choose same label if substrings are equal + if prev.is_some() && !self.lms_substring_eq(text, pos_types, prev.unwrap(), p) { + label += 1; + } + reduced_text[self.reduced_text_pos[p]] = cast(label).unwrap(); + prev = Some(p); + } + } + + // if we have less labels than substrings, we have to sort by recursion + // because two or more substrings are equal + if label + 1 < lms_substring_count { + // backup lms_pos + let lms_pos = self.lms_pos.clone(); + // recurse SA construction for reduced text + self.construct(&reduced_text); + // obtain sorted lms suffixes + self.lms_pos.clear(); + for &p in &self.pos { + self.lms_pos.push(lms_pos[p]); + } + } else { + // otherwise, lms_pos is updated with the sorted suffixes from pos + // obtain sorted lms suffixes + self.lms_pos.clear(); + for &p in &self.pos { + if pos_types.is_lms_pos(p) { + self.lms_pos.push(p); + } + } + } + } + } + + /// Construct the suffix array. 
+ fn construct(&mut self, text: &[T]) { + let pos_types = PosTypes::new(text); + self.calc_lms_pos(text, &pos_types); + self.calc_pos(text, &pos_types); + } + + /// Step 1 of the SAIS algorithm. + fn calc_lms_pos(&mut self, text: &[T], pos_types: &PosTypes) { + let n = text.len(); + + // collect LMS positions + self.lms_pos.clear(); + let mut i = 0; + for r in 0..n { + if pos_types.is_lms_pos(r) { + self.lms_pos.push(r); + self.reduced_text_pos[r] = i; + i += 1; + } + } + + // sort LMS substrings by applying step 2 with unsorted LMS positions + self.calc_pos(text, pos_types); + + let lms_substring_count = self.lms_pos.len(); + + if u8::try_from(lms_substring_count).is_ok() { + self.sort_lms_suffixes::(text, pos_types, lms_substring_count); + } else if u16::try_from(lms_substring_count).is_ok() { + self.sort_lms_suffixes::(text, pos_types, lms_substring_count); + } else if u32::try_from(lms_substring_count).is_ok() { + self.sort_lms_suffixes::(text, pos_types, lms_substring_count); + } else { + self.sort_lms_suffixes::(text, pos_types, lms_substring_count); + } + } + + /// Step 2 of the SAIS algorithm. 
+ fn calc_pos(&mut self, text: &[T], pos_types: &PosTypes) { + let n = text.len(); + self.pos.clear(); + + self.init_bucket_start(text); + self.init_bucket_end(text); + + // init all positions as unknown (n-1 is max position) + self.pos.resize(n, n); + + // insert LMS positions to the end of their buckets + for &p in self.lms_pos.iter().rev() { + let c: usize = cast(text[p]).unwrap(); + self.pos[self.bucket_end[c]] = p; + // subtract without overflow: last -1 will cause overflow, but it does not matter + self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1); + } + + // reset bucket ends + self.init_bucket_end(text); + + // insert L-positions into buckets + for r in 0..n { + let p = self.pos[r]; + // ignore undefined positions and the zero since it has no predecessor + if p == n || p == 0 { + continue; + } + let pred = p - 1; + if pos_types.is_l_pos(pred) { + let c: usize = cast(text[pred]).unwrap(); + self.pos[self.bucket_start[c]] = pred; + self.bucket_start[c] += 1; + } + } + + // insert S-positions into buckets + for r in (0..n).rev() { + let p = self.pos[r]; + if p == 0 { + continue; + } + let pred = p - 1; + if pos_types.is_s_pos(pred) { + let c: usize = cast(text[pred]).unwrap(); + self.pos[self.bucket_end[c]] = pred; + // subtract without overflow: last -1 will cause overflow, but it won't be used + self.bucket_end[c] = self.bucket_end[c].wrapping_sub(1); + } + } + } +} + +/// Position types (L or S). +#[derive(Debug)] +struct PosTypes { + pos_types: BitVec, +} + +impl PosTypes { + /// Calculate the text position type. + /// L-type marks suffixes being lexicographically larger than their successor, + /// S-type marks the others. + /// This function fills a BitVec, with 1-bits denoting S-type + /// and 0-bits denoting L-type. + /// + /// # Arguments + /// + /// * `text` - the text, ending with a sentinel. 
+ fn new(text: &[T]) -> Self { + let n = text.len(); + let mut pos_types = BitVec::new_fill(false, n as u64); + pos_types.set_bit(n as u64 - 1, true); + + for p in (0..n - 1).rev() { + if text[p] == text[p + 1] { + // if the characters are equal, the next position determines + // the lexicographical order + let v = pos_types.get_bit(p as u64 + 1); + pos_types.set_bit(p as u64, v); + } else { + pos_types.set_bit(p as u64, text[p] < text[p + 1]); + } + } + + PosTypes { pos_types } + } + + /// Check if p is S-position. + fn is_s_pos(&self, p: usize) -> bool { + self.pos_types.get_bit(p as u64) + } + + /// Check if p is L-position. + fn is_l_pos(&self, p: usize) -> bool { + !self.pos_types.get_bit(p as u64) + } + + /// Check if p is LMS-position. + fn is_lms_pos(&self, p: usize) -> bool { + p != 0 && self.is_s_pos(p) && self.is_l_pos(p - 1) + } +} + +#[cfg(test)] +mod tests { + #![allow(clippy::string_add, clippy::needless_range_loop)] + + use super::*; + use super::{transform_text, PosTypes, Sais}; + use crate::alphabets::{dna, Alphabet}; + use crate::data_structures::bwt::{bwt, less}; + use bv::{BitVec, BitsPush}; + use rand; + use rand::prelude::*; + use std::str; + + #[test] + fn test_pos_types() { + let orig_text = b"GCCTTAACATTATTACGCCTA$"; + let alphabet = Alphabet::new(orig_text); + let text: Vec = transform_text(orig_text, &alphabet, 1); + let n = text.len(); + + let pos_types = PosTypes::new(&text); + //let mut test = BitSlice::from_slice(&[0b01100110, 0b10010011, 0b01100100]).to_owned(); + let mut test = BitVec::new(); + test.push_block(0b001001101100100101100110); + test.truncate(n as u64); + assert_eq!(pos_types.pos_types, test); + let lms_pos: Vec = (0..n).filter(|&p| pos_types.is_lms_pos(p)).collect(); + assert_eq!(lms_pos, vec![1, 5, 8, 11, 14, 17, 21]); + } + + #[test] + fn test_buckets() { + let orig_text = b"GCCTTAACATTATTACGCCTA$"; + let alphabet = Alphabet::new(orig_text); + let text: Vec = transform_text(orig_text, &alphabet, 1); + let n = 
text.len(); + + let mut sais = Sais::new(n); + sais.init_bucket_start(&text); + assert_eq!(sais.bucket_start, vec![0, 1, 7, 13, 15]); + sais.init_bucket_end(&text); + assert_eq!(sais.bucket_end, vec![0, 6, 12, 14, 21]); + } + + #[test] + fn test_pos() { + let orig_text = b"GCCTTAACATTATTACGCCTA$"; + let alphabet = Alphabet::new(orig_text); + let text: Vec = transform_text(orig_text, &alphabet, 1); + let n = text.len(); + + let mut sais = Sais::new(n); + let pos_types = PosTypes::new(&text); + sais.lms_pos = vec![21, 5, 14, 8, 11, 17, 1]; + sais.calc_pos(&text, &pos_types); + assert_eq!( + sais.pos, + vec![21, 20, 5, 6, 14, 11, 8, 7, 17, 1, 15, 18, 2, 16, 0, 19, 4, 13, 10, 3, 12, 9,] + ); + } + + #[test] + fn test_lms_pos() { + let orig_text = b"GCCTTAACATTATTACGCCTA$"; + let alphabet = Alphabet::new(orig_text); + let text: Vec = transform_text(orig_text, &alphabet, 1); + let n = text.len(); + + let mut sais = Sais::new(n); + let pos_types = PosTypes::new(&text); + sais.calc_lms_pos(&text, &pos_types); + } + + #[test] + fn test_issue10_1() { + let text = b"TGTGTGTGTG$"; + let pos = suffix_array(text); + assert_eq!(pos, [10, 9, 7, 5, 3, 1, 8, 6, 4, 2, 0]); + } + + #[test] + fn test_issue10_2() { + let text = b"TGTGTGTG$"; + let pos = suffix_array(text); + assert_eq!(pos, [8, 7, 5, 3, 1, 6, 4, 2, 0]); + } + + #[test] + fn test_handles_sentinels_properly() { + let reads = 
b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$"; + suffix_array(reads); + } + + fn str_from_pos(sa: &[usize], text: &[u8], index: usize) -> String { + String::from( + str::from_utf8(&text[sa[index]..]) + .unwrap() + .split('$') + .next() + .unwrap_or(""), + ) + "$" + } + + fn rand_seqs(num_seqs: usize, seq_len: usize) -> Vec { + let mut rng = rand::thread_rng(); + let alpha = [b'A', b'T', b'C', b'G', b'N']; + let seqs = (0..num_seqs) + .map(|_| { + let len = rng.gen_range((seq_len / 2)..=seq_len); + (0..len).map(|_| *alpha.choose(&mut rng).unwrap()).collect::>() + }) + .collect::>(); + let mut res = seqs.join(&b'$'); + res.push(b'$'); + res + } + + #[test] + fn test_sorts_lexically() { + let mut test_cases = vec![ + (&b"A$C$G$T$"[..], "simple"), + (&b"A$A$T$T$"[..], "duplicates"), + (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"), + ( + &b"AGCCAT$\ + CAGCC$"[..], + "substring", + ), + ( + &b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\ + AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$" + [..], + "complex", + ), + ( + &b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\ + TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\ + 
AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\ + TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$" + [..], + "complex with revcomps", + ), + ]; + let num_rand = 100; + let rand_cases = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect::>(); + for i in 0..num_rand { + test_cases.push((&rand_cases[i], "rand test case")); + } + + for &(text, test_name) in &test_cases { + let pos = suffix_array(text); + for i in 0..(pos.len() - 2) { + // Check that every element in the suffix array is lexically <= the next elem + let cur = str_from_pos(&pos, text, i); + let next = str_from_pos(&pos, text, i + 1); + + assert!( + cur <= next, + "Failed:\n{}\n{}\nat positions {} and {} are out of order in \ + test: {}", + cur, + next, + pos[i], + pos[i + 1], + test_name + ); + } + } + } + + #[test] + fn test_sampled_matches() { + let mut test_cases = vec![(&b"A$C$G$T$"[..], "simple"), + (&b"A$A$T$T$"[..], "duplicates"), + (&b"AA$GA$CA$TA$TC$TG$GT$GC$"[..], "two letter"), + (&b"AGCCAT$\ + CAGCC$"[..], + "substring"), + (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\ + AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$"[..], + "complex"), + (&b"GTAGGCCTAATTATAATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAA$\ + TTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATTATAATTAGGCCTAC$\ + AATCAGCGGACATTTCGTATTGCTCGGGCTGCCAGGATTTTAGCATCAGTAGCCGGGTAATGGAACCTCAAGAGGTCAGCGTCGAATGGCTATTCCAATA$\ + TATTGGAATAGCCATTCGACGCTGACCTCTTGAGGTTCCATTACCCGGCTACTGATGCTAAAATCCTGGCAGCCCGAGCAATACGAAATGTCCGCTGATT$"[..], + "complex with revcomps"), + (&b"GTAG$GCCTAAT$TATAATCAG$"[..], "issue70"), + (&b"TGTGTGTGTG$"[..], "repeating"), + 
(&b"TACTCCGCTAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTT$AAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTAGCGGAGTA$TAGGGACACCTAAATAGATACTCGCAAAGGCGACTGATATATCCTTAGGTCGAAGAGATACCAGAGAAATAGTAGGTCTTAGGCTAGTCCTTGTCCAGTA$TACTGGACAAGGACTAGCCTAAGACCTACTATTTCTCTGGTATCTCTTCGACCTAAGGATATATCAGTCGCCTTTGCGAGTATCTATTTAGGTGTCCCTA$ACGCACCCCGGCATTCGTCGACTCTACACTTAGTGGAACATACAAATTCGCTCGCAGGAGCGCCTCATACATTCTAACGCAGTGATCTTCGGCTGAGACT$AGTCTCAGCCGAAGATCACTGCGTTAGAATGTATGAGGCGCTCCTGCGAGCGAATTTGTATGTTCCACTAAGTGTAGAGTCGACGAATGCCGGGGTGCGT$"[..], "complex sentinels"), + ]; + let num_rand = 100; + let rand_cases = (0..num_rand).map(|i| rand_seqs(10, i * 10)).collect::>(); + for i in 0..num_rand { + test_cases.push((&rand_cases[i], "rand test case")); + } + + for &(text, test_name) in &test_cases { + for &sample_rate in &[2, 3, 5, 16] { + let alphabet = dna::n_alphabet(); + let sa = suffix_array(text); + let bwt = bwt(text, &sa); + let less = less(&bwt, &alphabet); + let occ = Occ::new(&bwt, 3, &alphabet); + let sampled = sa.sample(text, &bwt, &less, &occ, sample_rate); + + for i in 0..sa.len() { + let sa_idx = sa[i]; + let sampled_idx = sampled.get(i).unwrap(); + assert_eq!( + sa_idx, + sampled_idx, + "Failed:\n{}\n{}\nat index {} do not match in test: {} (sample rate: {})", + str::from_utf8(&text[sa_idx..]).unwrap(), + str::from_utf8(&text[sampled_idx..]).unwrap(), + i, + test_name, + sample_rate + ); + } + } + } + } +} diff --git a/packages_rs/3rdparty/bio_inline/src/lib.rs b/packages_rs/3rdparty/bio_inline/src/lib.rs new file mode 100644 index 000000000..302459984 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/lib.rs @@ -0,0 +1,10 @@ +#![allow(clippy::if_then_some_else_none)] +#![allow(clippy::many_single_char_names)] +#![allow(clippy::missing_const_for_fn)] +#![allow(clippy::missing_safety_doc)] +#![allow(clippy::upper_case_acronyms)] + +pub mod alphabets; +pub mod data_structures; +pub mod 
utils; +pub use bio_types; diff --git a/packages_rs/3rdparty/bio_inline/src/utils/mod.rs b/packages_rs/3rdparty/bio_inline/src/utils/mod.rs new file mode 100644 index 000000000..e49abb8b6 --- /dev/null +++ b/packages_rs/3rdparty/bio_inline/src/utils/mod.rs @@ -0,0 +1,37 @@ +/// In place implementation of scan over a slice. +pub fn scan T>(a: &mut [T], op: F) { + let mut s = a[0]; + for v in a.iter_mut().skip(1) { + s = op(s, *v); + *v = s; + } +} + +// Inplace implementation of prescan over a slice. +pub fn prescan T>(a: &mut [T], neutral: T, op: F) { + let mut s = neutral; + for v in a.iter_mut() { + let t = *v; + *v = s; + s = op(s, t); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_scan() { + let mut a = vec![1, 0, 0, 1]; + scan(&mut a[..], |a, b| a + b); + assert_eq!(a, vec![1, 1, 1, 2]); + } + + #[test] + fn test_prescan() { + let mut a = vec![1, 0, 0, 1]; + prescan(&mut a[..], 0, |a, b| a + b); + assert_eq!(a, vec![0, 1, 1, 1]); + } +} diff --git a/packages_rs/nextclade/Cargo.toml b/packages_rs/nextclade/Cargo.toml index 49ba05d62..0250dbe3e 100644 --- a/packages_rs/nextclade/Cargo.toml +++ b/packages_rs/nextclade/Cargo.toml @@ -17,6 +17,7 @@ assert2 = "=0.3.11" auto_ops = "=0.3.0" bio = "=1.3.1" bio-types = "=1.0.0" +bio_inline = { path = "../3rdparty/bio_inline" } chrono = { version = "=0.4.26", default-features = false, features = ["clock", "std", "wasmbind"] } clap = { version = "=4.3.10", features = ["derive"] } clap-verbosity-flag = "=2.0.1" diff --git a/packages_rs/nextclade/src/align/seed_match2.rs b/packages_rs/nextclade/src/align/seed_match2.rs index 4f51de4ff..c2ed47d8e 100644 --- a/packages_rs/nextclade/src/align/seed_match2.rs +++ b/packages_rs/nextclade/src/align/seed_match2.rs @@ -2,10 +2,10 @@ use crate::align::params::AlignPairwiseParams; use crate::alphabet::letter::Letter; use crate::alphabet::nuc::{from_nuc_seq, Nuc}; use crate::make_error; -use bio::alphabets; -use bio::data_structures::bwt::{bwt, less, Less, 
Occ, BWT}; -use bio::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable}; -use bio::data_structures::suffix_array::suffix_array; +use bio_inline::alphabets; +use bio_inline::data_structures::bwt::{bwt, less, Less, Occ, BWT}; +use bio_inline::data_structures::fmindex::{BackwardSearchResult, FMIndex, FMIndexable}; +use bio_inline::data_structures::suffix_array::suffix_array; use eyre::Report; use gcollections::ops::{Bounded, Intersection, IsEmpty, Union}; use interval::interval_set::{IntervalSet, ToIntervalSet};