From 37895048269872a3910f9dcd8da5e18d1fd53bf3 Mon Sep 17 00:00:00 2001 From: Jonathan Johnson Date: Sun, 30 Jul 2023 14:34:14 -0700 Subject: [PATCH] Simplified code for SymbolMaps The previous code had several code paths that were unreachable. This refactor gets rid of them. --- CHANGELOG.md | 1 + README.md | 2 +- benchmarks/examples/logs.rs | 2 +- pot/src/de.rs | 170 +++++++++++++++++++++--------------- pot/src/lib.rs | 70 ++++++++++++++- 5 files changed, 171 insertions(+), 74 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d41a10e8..4e91d3fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 used when reading an associated `Nucleus::Bytes`. When `Nucleus::Bytes` contains `BufferedBytes::Scratch`, `scratch` will contain the bytes contained in the atom. +- `SymbolMapRef` is now a struct with private contents. ### Changed diff --git a/README.md b/README.md index 3705718b..92606a23 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ provide an encoding format for [`serde`](https://serde.rs) that: * Is compact. While still being self-describing, Pot's main space-saving feature is not repeating symbols/identifiers more than one time while serializing. When serializing arrays of structures, this can make a major difference. The - [logs.rs](https://github.com/khonsulabs/pot/blob/main/pot/examples/logs.rs) + [logs.rs](https://github.com/khonsulabs/pot/blob/main/benchmarks/examples/logs.rs) example demonstrates this: ```sh diff --git a/benchmarks/examples/logs.rs b/benchmarks/examples/logs.rs index c0cbc5e2..d52bc72b 100644 --- a/benchmarks/examples/logs.rs +++ b/benchmarks/examples/logs.rs @@ -68,7 +68,7 @@ fn main() -> anyhow::Result<()> { // In this situation, the payloads across a network are generally smaller, // so let's show the benefits by just encoding a single log entry. let mut sender_state = pot::ser::SymbolMap::default(); - let mut receiver_state = pot::de::SymbolMap::new(); + let mut receiver_state = pot::de::SymbolMap::default(); let mut payload_buffer = Vec::new(); logs.entries[0].serialize(&mut sender_state.serializer_for(&mut payload_buffer)?)?; let first_transmission_length = payload_buffer.len(); diff --git a/pot/src/de.rs b/pot/src/de.rs index c4cf08a7..4aaf17d0 100644 --- a/pot/src/de.rs +++ b/pot/src/de.rs @@ -1,5 +1,6 @@ use std::collections::VecDeque; use std::fmt::Debug; +use std::io::Read; use std::ops::{Deref, Range}; use byteorder::ReadBytesExt; @@ -19,7 +20,7 @@ use crate::{Error, Result}; /// Deserializer for the Pot format. pub struct Deserializer<'s, 'de, R: Reader<'de>> { input: R, - symbols: SymbolMap<'s, 'de>, + symbols: SymbolMapRef<'s, 'de>, peeked_atom: VecDeque>, remaining_budget: usize, scratch: Vec, @@ -39,16 +40,12 @@ impl<'s, 'de> Deserializer<'s, 'de, SliceReader<'de>> { /// Returns a new deserializer for `input`. #[inline] pub(crate) fn from_slice(input: &'de [u8], maximum_bytes_allocatable: usize) -> Result { - Self::from_slice_with_symbols( - input, - SymbolMap::Borrowed(SymbolList::new()), - maximum_bytes_allocatable, - ) + Self::from_slice_with_symbols(input, SymbolMapRef::temporary(), maximum_bytes_allocatable) } fn from_slice_with_symbols( input: &'de [u8], - symbols: SymbolMap<'s, 'de>, + symbols: SymbolMapRef<'s, 'de>, maximum_bytes_allocatable: usize, ) -> Result { Self::new(SliceReader::from(input), symbols, maximum_bytes_allocatable) @@ -65,12 +62,12 @@ impl<'s, 'de> Deserializer<'s, 'de, SliceReader<'de>> { impl<'s, 'de, R: ReadBytesExt> Deserializer<'s, 'de, IoReader> { /// Returns a new deserializer for `input`. #[inline] - pub(crate) fn from_read(input: R, maximum_bytes_allocatable: usize) -> Result { - Self::new( - IoReader::new(input), - SymbolMap::Borrowed(SymbolList::new()), - maximum_bytes_allocatable, - ) + pub(crate) fn from_read( + input: R, + symbols: SymbolMapRef<'s, 'de>, + maximum_bytes_allocatable: usize, + ) -> Result { + Self::new(IoReader::new(input), symbols, maximum_bytes_allocatable) } } @@ -78,7 +75,7 @@ impl<'s, 'de, R: Reader<'de>> Deserializer<'s, 'de, R> { #[inline] pub(crate) fn new( input: R, - symbols: SymbolMap<'s, 'de>, + symbols: SymbolMapRef<'s, 'de>, maximum_bytes_allocatable: usize, ) -> Result { let mut deserializer = Deserializer { @@ -1047,89 +1044,61 @@ impl<'a, 's, 'de, R: Reader<'de>> VariantAccess<'de> for &'a mut Deserializer<'s } } -/// A collection of deserialized symbols. +/// A reference to a [`SymbolList`]. #[derive(Debug)] -pub enum SymbolMap<'a, 'de> { - /// An owned list of symbols. - Owned(SymbolList<'static>), - /// A mutable reference to an owned list of symbols. - Persistent(&'a mut SymbolList<'static>), - /// A list of borrowed symbols. - Borrowed(SymbolList<'de>), -} +pub struct SymbolMapRef<'a, 'de>(SymbolMapRefPrivate<'a, 'de>); -impl<'de> SymbolMap<'static, 'de> { - /// Returns a new symbol map that will persist symbols between payloads. - #[must_use] - #[inline] - pub const fn new() -> Self { - Self::Owned(SymbolList::new()) - } +#[derive(Debug)] +enum SymbolMapRefPrivate<'a, 'de> { + /// A reference to a temporary symbol list, which is able to borrow from the + /// source data. + Temporary(SymbolList<'de>), + /// A reference to a persistent symbol list that retains symbols across + /// multiple deserialization sessions. + Persistent(&'a mut SymbolMap), +} - /// Returns a deserializer for `slice`. - #[inline] - pub fn deserializer_for_slice<'a>( - &'a mut self, - slice: &'de [u8], - ) -> Result>> { - Deserializer::from_slice_with_symbols(slice, self.persistent(), usize::MAX) +impl<'a, 'de> SymbolMapRef<'a, 'de> { + pub(crate) const fn temporary() -> Self { + Self(SymbolMapRefPrivate::Temporary(SymbolList::new())) } - #[must_use] - fn persistent(&mut self) -> SymbolMap<'_, 'de> { - match self { - Self::Owned(vec) => SymbolMap::Persistent(vec), - Self::Persistent(vec) => SymbolMap::Persistent(vec), - Self::Borrowed(_) => unreachable!(), - } - } -} - -impl<'a, 'de> SymbolMap<'a, 'de> { #[allow(clippy::cast_possible_truncation)] fn visit_symbol_id(&self, symbol_id: u64, visitor: V) -> Result where V: Visitor<'de>, { - match self { - Self::Owned(vec) => { + match &self.0 { + SymbolMapRefPrivate::Temporary(vec) => { let symbol = vec .get(symbol_id as usize) .ok_or(Error::UnknownSymbol(symbol_id))?; - visitor.visit_str(&symbol) + match symbol { + SymbolStr::Data(symbol) => visitor.visit_borrowed_str(symbol), + SymbolStr::InList(symbol) => visitor.visit_str(symbol), + } } - Self::Persistent(vec) => { + SymbolMapRefPrivate::Persistent(vec) => { let symbol = vec .get(symbol_id as usize) .ok_or(Error::UnknownSymbol(symbol_id))?; visitor.visit_str(&symbol) } - Self::Borrowed(vec) => { - let symbol = vec - .get(symbol_id as usize) - .ok_or(Error::UnknownSymbol(symbol_id))?; - match symbol { - SymbolStr::Data(symbol) => visitor.visit_borrowed_str(symbol), - SymbolStr::InList(symbol) => visitor.visit_str(symbol), - } - } } } fn push(&mut self, symbol: &str) { #[allow(clippy::match_same_arms)] // false positive due to lifetimes - match self { - Self::Owned(vec) => vec.push(symbol), - Self::Persistent(vec) => vec.push(symbol), - Self::Borrowed(vec) => vec.push(symbol), + match &mut self.0 { + SymbolMapRefPrivate::Temporary(vec) => vec.push(symbol), + SymbolMapRefPrivate::Persistent(vec) => vec.push(symbol), } } fn push_borrowed(&mut self, symbol: &'de str) { - match self { - Self::Owned(vec) => vec.push(symbol), - Self::Persistent(vec) => vec.push(symbol), - Self::Borrowed(vec) => vec.push_borrowed(symbol), + match &mut self.0 { + SymbolMapRefPrivate::Temporary(vec) => vec.push_borrowed(symbol), + SymbolMapRefPrivate::Persistent(vec) => vec.push(symbol), } } } @@ -1141,8 +1110,17 @@ pub struct SymbolList<'de> { entries: Vec>, } +impl Default for SymbolList<'_> { + fn default() -> Self { + Self::new() + } +} + impl<'de> SymbolList<'de> { - const fn new() -> Self { + /// Returns a new, empty symbol list. + #[inline] + #[must_use] + pub const fn new() -> Self { Self { buffer: String::new(), entries: Vec::new(), @@ -1174,6 +1152,58 @@ impl<'de> SymbolList<'de> { SymbolListEntry::Borrowed(str) => Some(SymbolStr::Data(str)), } } + + /// Returns the number of entries in the symbol list. + #[inline] + #[must_use] + pub fn len(&self) -> usize { + self.entries.len() + } + + /// Returns true if there are no symbols in this list. + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// An alias to a [`SymbolList`] with a static lifetime. This type persists +/// symbols referenced across multiple deserialization sessions. +pub type SymbolMap = SymbolList<'static>; + +impl SymbolMap { + /// Returns a deserializer for `slice` that reuses symbol ids. + /// + /// This should only be used with data generated by using a persistent + /// [`ser::SymbolMap`](crate::ser::SymbolMap). + #[inline] + pub fn deserializer_for_slice<'a, 'de>( + &'a mut self, + slice: &'de [u8], + ) -> Result>> { + Deserializer::from_slice_with_symbols(slice, self.persistent(), usize::MAX) + } + + /// Returns a deserializer for `reader`. + /// + /// This should only be used with data generated by using a persistent + /// [`ser::SymbolMap`](crate::ser::SymbolMap). + #[inline] + pub fn deserializer_for<'de, R>( + &mut self, + reader: R, + ) -> Result>> + where + R: Read, + { + Deserializer::from_read(reader, self.persistent(), usize::MAX) + } + + #[must_use] + fn persistent<'de>(&mut self) -> SymbolMapRef<'_, 'de> { + SymbolMapRef(SymbolMapRefPrivate::Persistent(self)) + } } /// A symbol stored in a [`SymbolList`]. diff --git a/pot/src/lib.rs b/pot/src/lib.rs index 78341196..89881170 100644 --- a/pot/src/lib.rs +++ b/pot/src/lib.rs @@ -36,6 +36,7 @@ pub type Result = std::result::Result; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; +use crate::de::SymbolMapRef; use crate::reader::IoReader; /// Serialize `value` using Pot into a `Vec`. @@ -154,8 +155,11 @@ impl Config { where T: DeserializeOwned, { - let mut deserializer = - de::Deserializer::from_read(IoReader::new(reader), self.allocation_budget)?; + let mut deserializer = de::Deserializer::from_read( + IoReader::new(reader), + SymbolMapRef::temporary(), + self.allocation_budget, + )?; T::deserialize(&mut deserializer) } @@ -780,4 +784,66 @@ mod tests { Err(ValueError::Custom(String::from("oh no!"))) ); } + + #[test] + fn persistent_symbols_slice() { + let mut sender = ser::SymbolMap::default(); + let mut receiver = de::SymbolList::default(); + + let mut bytes = Vec::new(); + NumbersStruct::default() + .serialize(&mut sender.serializer_for(&mut bytes).unwrap()) + .unwrap(); + let _result = + NumbersStruct::deserialize(&mut receiver.deserializer_for_slice(&bytes).unwrap()) + .unwrap(); + let symbol_count_after_first_send = receiver.len(); + let first_payload_len = bytes.len(); + + // Send again, confirm the symbol list didn't grow. + bytes.clear(); + NumbersStruct::default() + .serialize(&mut sender.serializer_for(&mut bytes).unwrap()) + .unwrap(); + let _result = + NumbersStruct::deserialize(&mut receiver.deserializer_for_slice(&bytes).unwrap()) + .unwrap(); + assert_eq!(symbol_count_after_first_send, receiver.len()); + println!( + "First: {first_payload_len} bytes; Second: {} bytes", + bytes.len() + ); + assert!(first_payload_len > bytes.len()); + } + + #[test] + fn persistent_symbols_read() { + let mut sender = ser::SymbolMap::default(); + let mut receiver = de::SymbolList::default(); + + let mut bytes = Vec::new(); + NumbersStruct::default() + .serialize(&mut sender.serializer_for(&mut bytes).unwrap()) + .unwrap(); + let _result = + NumbersStruct::deserialize(&mut receiver.deserializer_for(&bytes[..]).unwrap()) + .unwrap(); + let symbol_count_after_first_send = receiver.len(); + let first_payload_len = bytes.len(); + + // Send again, confirm the symbol list didn't grow. + bytes.clear(); + NumbersStruct::default() + .serialize(&mut sender.serializer_for(&mut bytes).unwrap()) + .unwrap(); + let _result = + NumbersStruct::deserialize(&mut receiver.deserializer_for(&bytes[..]).unwrap()) + .unwrap(); + assert_eq!(symbol_count_after_first_send, receiver.len()); + println!( + "First: {first_payload_len} bytes; Second: {} bytes", + bytes.len() + ); + assert!(first_payload_len > bytes.len()); + } }