Skip to content

Commit

Permalink
Simplified code for SymbolMaps
Browse files Browse the repository at this point in the history
The previous code had several code paths that were unreachable. This
refactor gets rid of them.
  • Loading branch information
ecton committed Jul 30, 2023
1 parent 83a9af1 commit 3789504
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 74 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
used when reading an associated `Nucleus::Bytes`. When `Nucleus::Bytes`
contains `BufferedBytes::Scratch`, `scratch` will contain the bytes contained
in the atom.
- `SymbolMapRef` is now a struct with private contents.

### Changed

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ provide an encoding format for [`serde`](https://serde.rs) that:
* Is compact. While still being self-describing, Pot's main space-saving feature
is not repeating symbols/identifiers more than one time while serializing.
When serializing arrays of structures, this can make a major difference. The
[logs.rs](https://github.com/khonsulabs/pot/blob/main/pot/examples/logs.rs)
[logs.rs](https://github.com/khonsulabs/pot/blob/main/benchmarks/examples/logs.rs)
example demonstrates this:

```sh
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/examples/logs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ fn main() -> anyhow::Result<()> {
// In this situation, the payloads across a network are generally smaller,
// so let's show the benefits by just encoding a single log entry.
let mut sender_state = pot::ser::SymbolMap::default();
let mut receiver_state = pot::de::SymbolMap::new();
let mut receiver_state = pot::de::SymbolMap::default();
let mut payload_buffer = Vec::new();
logs.entries[0].serialize(&mut sender_state.serializer_for(&mut payload_buffer)?)?;
let first_transmission_length = payload_buffer.len();
Expand Down
170 changes: 100 additions & 70 deletions pot/src/de.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::collections::VecDeque;
use std::fmt::Debug;
use std::io::Read;
use std::ops::{Deref, Range};

use byteorder::ReadBytesExt;
Expand All @@ -19,7 +20,7 @@ use crate::{Error, Result};
/// Deserializer for the Pot format.
pub struct Deserializer<'s, 'de, R: Reader<'de>> {
input: R,
symbols: SymbolMap<'s, 'de>,
symbols: SymbolMapRef<'s, 'de>,
peeked_atom: VecDeque<Atom<'de>>,
remaining_budget: usize,
scratch: Vec<u8>,
Expand All @@ -39,16 +40,12 @@ impl<'s, 'de> Deserializer<'s, 'de, SliceReader<'de>> {
/// Returns a new deserializer for `input`.
#[inline]
pub(crate) fn from_slice(input: &'de [u8], maximum_bytes_allocatable: usize) -> Result<Self> {
Self::from_slice_with_symbols(
input,
SymbolMap::Borrowed(SymbolList::new()),
maximum_bytes_allocatable,
)
Self::from_slice_with_symbols(input, SymbolMapRef::temporary(), maximum_bytes_allocatable)
}

fn from_slice_with_symbols(
input: &'de [u8],
symbols: SymbolMap<'s, 'de>,
symbols: SymbolMapRef<'s, 'de>,
maximum_bytes_allocatable: usize,
) -> Result<Self> {
Self::new(SliceReader::from(input), symbols, maximum_bytes_allocatable)
Expand All @@ -65,20 +62,20 @@ impl<'s, 'de> Deserializer<'s, 'de, SliceReader<'de>> {
impl<'s, 'de, R: ReadBytesExt> Deserializer<'s, 'de, IoReader<R>> {
/// Returns a new deserializer for `input`.
#[inline]
pub(crate) fn from_read(input: R, maximum_bytes_allocatable: usize) -> Result<Self> {
Self::new(
IoReader::new(input),
SymbolMap::Borrowed(SymbolList::new()),
maximum_bytes_allocatable,
)
pub(crate) fn from_read(
input: R,
symbols: SymbolMapRef<'s, 'de>,
maximum_bytes_allocatable: usize,
) -> Result<Self> {
Self::new(IoReader::new(input), symbols, maximum_bytes_allocatable)
}
}

impl<'s, 'de, R: Reader<'de>> Deserializer<'s, 'de, R> {
#[inline]
pub(crate) fn new(
input: R,
symbols: SymbolMap<'s, 'de>,
symbols: SymbolMapRef<'s, 'de>,
maximum_bytes_allocatable: usize,
) -> Result<Self> {
let mut deserializer = Deserializer {
Expand Down Expand Up @@ -1047,89 +1044,61 @@ impl<'a, 's, 'de, R: Reader<'de>> VariantAccess<'de> for &'a mut Deserializer<'s
}
}

/// A collection of deserialized symbols.
/// A reference to a [`SymbolList`].
#[derive(Debug)]
pub enum SymbolMap<'a, 'de> {
/// An owned list of symbols.
Owned(SymbolList<'static>),
/// A mutable reference to an owned list of symbols.
Persistent(&'a mut SymbolList<'static>),
/// A list of borrowed symbols.
Borrowed(SymbolList<'de>),
}
pub struct SymbolMapRef<'a, 'de>(SymbolMapRefPrivate<'a, 'de>);

impl<'de> SymbolMap<'static, 'de> {
/// Returns a new symbol map that will persist symbols between payloads.
#[must_use]
#[inline]
pub const fn new() -> Self {
Self::Owned(SymbolList::new())
}
#[derive(Debug)]
enum SymbolMapRefPrivate<'a, 'de> {
/// A reference to a temporary symbol list, which is able to borrow from the
/// source data.
Temporary(SymbolList<'de>),
/// A reference to a persistent symbol list that retains symbols across
/// multiple deserialization sessions.
Persistent(&'a mut SymbolMap),
}

/// Returns a deserializer for `slice`.
#[inline]
pub fn deserializer_for_slice<'a>(
&'a mut self,
slice: &'de [u8],
) -> Result<Deserializer<'a, 'de, SliceReader<'de>>> {
Deserializer::from_slice_with_symbols(slice, self.persistent(), usize::MAX)
impl<'a, 'de> SymbolMapRef<'a, 'de> {
pub(crate) const fn temporary() -> Self {
Self(SymbolMapRefPrivate::Temporary(SymbolList::new()))
}

#[must_use]
fn persistent(&mut self) -> SymbolMap<'_, 'de> {
match self {
Self::Owned(vec) => SymbolMap::Persistent(vec),
Self::Persistent(vec) => SymbolMap::Persistent(vec),
Self::Borrowed(_) => unreachable!(),
}
}
}

impl<'a, 'de> SymbolMap<'a, 'de> {
#[allow(clippy::cast_possible_truncation)]
fn visit_symbol_id<V>(&self, symbol_id: u64, visitor: V) -> Result<V::Value>
where
V: Visitor<'de>,
{
match self {
Self::Owned(vec) => {
match &self.0 {
SymbolMapRefPrivate::Temporary(vec) => {
let symbol = vec
.get(symbol_id as usize)
.ok_or(Error::UnknownSymbol(symbol_id))?;
visitor.visit_str(&symbol)
match symbol {
SymbolStr::Data(symbol) => visitor.visit_borrowed_str(symbol),
SymbolStr::InList(symbol) => visitor.visit_str(symbol),
}
}
Self::Persistent(vec) => {
SymbolMapRefPrivate::Persistent(vec) => {
let symbol = vec
.get(symbol_id as usize)
.ok_or(Error::UnknownSymbol(symbol_id))?;
visitor.visit_str(&symbol)
}
Self::Borrowed(vec) => {
let symbol = vec
.get(symbol_id as usize)
.ok_or(Error::UnknownSymbol(symbol_id))?;
match symbol {
SymbolStr::Data(symbol) => visitor.visit_borrowed_str(symbol),
SymbolStr::InList(symbol) => visitor.visit_str(symbol),
}
}
}
}

fn push(&mut self, symbol: &str) {
#[allow(clippy::match_same_arms)] // false positive due to lifetimes
match self {
Self::Owned(vec) => vec.push(symbol),
Self::Persistent(vec) => vec.push(symbol),
Self::Borrowed(vec) => vec.push(symbol),
match &mut self.0 {
SymbolMapRefPrivate::Temporary(vec) => vec.push(symbol),
SymbolMapRefPrivate::Persistent(vec) => vec.push(symbol),
}
}

fn push_borrowed(&mut self, symbol: &'de str) {
match self {
Self::Owned(vec) => vec.push(symbol),
Self::Persistent(vec) => vec.push(symbol),
Self::Borrowed(vec) => vec.push_borrowed(symbol),
match &mut self.0 {
SymbolMapRefPrivate::Temporary(vec) => vec.push_borrowed(symbol),
SymbolMapRefPrivate::Persistent(vec) => vec.push(symbol),
}
}
}
Expand All @@ -1141,8 +1110,17 @@ pub struct SymbolList<'de> {
entries: Vec<SymbolListEntry<'de>>,
}

impl Default for SymbolList<'_> {
fn default() -> Self {
Self::new()
}
}

impl<'de> SymbolList<'de> {
const fn new() -> Self {
/// Returns a new, empty symbol list.
#[inline]
#[must_use]
pub const fn new() -> Self {
Self {
buffer: String::new(),
entries: Vec::new(),
Expand Down Expand Up @@ -1174,6 +1152,58 @@ impl<'de> SymbolList<'de> {
SymbolListEntry::Borrowed(str) => Some(SymbolStr::Data(str)),
}
}

/// Returns the number of entries in the symbol list.
#[inline]
#[must_use]
pub fn len(&self) -> usize {
self.entries.len()
}

/// Returns true if there are no symbols in this list.
#[inline]
#[must_use]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}

/// An alias to a [`SymbolList`] with a static lifetime. This type persists
/// symbols referenced across multiple deserialization sessions.
pub type SymbolMap = SymbolList<'static>;

impl SymbolMap {
/// Returns a deserializer for `slice` that reuses symbol ids.
///
/// This should only be used with data generated by using a persistent
/// [`ser::SymbolMap`](crate::ser::SymbolMap).
#[inline]
pub fn deserializer_for_slice<'a, 'de>(
&'a mut self,
slice: &'de [u8],
) -> Result<Deserializer<'a, 'de, SliceReader<'de>>> {
Deserializer::from_slice_with_symbols(slice, self.persistent(), usize::MAX)
}

/// Returns a deserializer for `reader`.
///
/// This should only be used with data generated by using a persistent
/// [`ser::SymbolMap`](crate::ser::SymbolMap).
#[inline]
pub fn deserializer_for<'de, R>(
&mut self,
reader: R,
) -> Result<Deserializer<'_, 'de, IoReader<R>>>
where
R: Read,
{
Deserializer::from_read(reader, self.persistent(), usize::MAX)
}

#[must_use]
fn persistent<'de>(&mut self) -> SymbolMapRef<'_, 'de> {
SymbolMapRef(SymbolMapRefPrivate::Persistent(self))
}
}

/// A symbol stored in a [`SymbolList`].
Expand Down
70 changes: 68 additions & 2 deletions pot/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};

use crate::de::SymbolMapRef;
use crate::reader::IoReader;

/// Serialize `value` using Pot into a `Vec<u8>`.
Expand Down Expand Up @@ -154,8 +155,11 @@ impl Config {
where
T: DeserializeOwned,
{
let mut deserializer =
de::Deserializer::from_read(IoReader::new(reader), self.allocation_budget)?;
let mut deserializer = de::Deserializer::from_read(
IoReader::new(reader),
SymbolMapRef::temporary(),
self.allocation_budget,
)?;
T::deserialize(&mut deserializer)
}

Expand Down Expand Up @@ -780,4 +784,66 @@ mod tests {
Err(ValueError::Custom(String::from("oh no!")))
);
}

#[test]
fn persistent_symbols_slice() {
let mut sender = ser::SymbolMap::default();
let mut receiver = de::SymbolList::default();

let mut bytes = Vec::new();
NumbersStruct::default()
.serialize(&mut sender.serializer_for(&mut bytes).unwrap())
.unwrap();
let _result =
NumbersStruct::deserialize(&mut receiver.deserializer_for_slice(&bytes).unwrap())
.unwrap();
let symbol_count_after_first_send = receiver.len();
let first_payload_len = bytes.len();

// Send again, confirm the symbol list didn't grow.
bytes.clear();
NumbersStruct::default()
.serialize(&mut sender.serializer_for(&mut bytes).unwrap())
.unwrap();
let _result =
NumbersStruct::deserialize(&mut receiver.deserializer_for_slice(&bytes).unwrap())
.unwrap();
assert_eq!(symbol_count_after_first_send, receiver.len());
println!(
"First: {first_payload_len} bytes; Second: {} bytes",
bytes.len()
);
assert!(first_payload_len > bytes.len());
}

#[test]
fn persistent_symbols_read() {
let mut sender = ser::SymbolMap::default();
let mut receiver = de::SymbolList::default();

let mut bytes = Vec::new();
NumbersStruct::default()
.serialize(&mut sender.serializer_for(&mut bytes).unwrap())
.unwrap();
let _result =
NumbersStruct::deserialize(&mut receiver.deserializer_for(&bytes[..]).unwrap())
.unwrap();
let symbol_count_after_first_send = receiver.len();
let first_payload_len = bytes.len();

// Send again, confirm the symbol list didn't grow.
bytes.clear();
NumbersStruct::default()
.serialize(&mut sender.serializer_for(&mut bytes).unwrap())
.unwrap();
let _result =
NumbersStruct::deserialize(&mut receiver.deserializer_for(&bytes[..]).unwrap())
.unwrap();
assert_eq!(symbol_count_after_first_send, receiver.len());
println!(
"First: {first_payload_len} bytes; Second: {} bytes",
bytes.len()
);
assert!(first_payload_len > bytes.len());
}
}

0 comments on commit 3789504

Please sign in to comment.