~joshleeb/fastcdc

d0cba74fef53030b4a6e98d03be3daf476a470df — Josh Leeb-du Toit 4 months ago fd12c06
Add chunking iterator
3 files changed, 200 insertions(+), 0 deletions(-)

M Cargo.toml
A src/chunk.rs
M src/lib.rs
M Cargo.toml => Cargo.toml +4 -0
@@ 4,3 4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]

[dev-dependencies]
quickcheck = "1.0.3"
quickcheck_macros = "1.0.0"

A src/chunk.rs => src/chunk.rs +189 -0
@@ 0,0 1,189 @@
use crate::{mask, Hasher, MASK_SIZE};

/// Chunking options.
///
/// Controls where the chunking iterator [`Iter`] is allowed to, and is likely to, cut chunks.
/// Lengths are counted in bytes of the input data.  Construct with [`Opts::new`] (or
/// [`Opts::default`]) and optionally refine with [`Opts::with_min_len`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Opts {
    /// Desired chunk length.  While a prospective chunk is shorter than this, the small mask
    /// is used to test for a boundary; once it reaches this length the large mask is used.
    pub desired_len: usize,
    /// Minimum chunk length.  No boundary test is performed while a prospective chunk is shorter
    /// than this.
    pub min_len: usize,
    /// Small mask, with an increased number of effective bits.  Derived in [`Opts::new`] from
    /// `log2(desired_len) + norm_level` bits.
    mask_s: u64,
    /// Large mask, with a decreased number of effective bits.  Derived in [`Opts::new`] from
    /// `log2(desired_len) - norm_level` bits.
    mask_l: u64,
}

impl Opts {
    /// Returns a new `Opts` given the desired chunk length, and chunking normalization level.
    ///
    /// The minimum chunk length starts at `0`; use [`Opts::with_min_len`] to set it.
    ///
    /// # Panics
    ///
    /// If the masks derived from the `desired_len` and `norm_level` exceed the maximum mask size
    /// of 48 bits.  More specifically, if
    ///
    /// - log2(desired_len) + norm_level > 48; or
    /// - log2(desired_len) - norm_level < 1.
    pub fn new(desired_len: usize, norm_level: usize) -> Self {
        let n = log2(desired_len);

        // Determining the masks is based on the desired length, and the chunking normalization
        // level.  `mask_s` will contain more effective bits than `mask_l` (for `norm_level` > 0).
        //
        // When the prospective chunk len is smaller than the desired chunk len, `mask_s` is used
        // to make finding the chunk boundary harder.  When the prospective chunk len is greater
        // than desired, we use `mask_l` to make finding the chunk boundary easier, in that it is
        // more likely a chunk boundary is found.
        //
        // Both bounds are checked without performing the raw `usize` arithmetic first, so an
        // oversized `norm_level` always hits these asserts rather than overflowing (which would
        // wrap silently in release builds).
        assert!(
            matches!(n.checked_add(norm_level), Some(bits) if bits <= MASK_SIZE),
            "log2(desired_len) + norm_level must not exceed {} bits",
            MASK_SIZE
        );
        let mask_s = mask(n + norm_level);
        assert!(n > norm_level, "log2(desired_len) - norm_level must be at least 1");
        let mask_l = mask(n - norm_level);

        Self { desired_len, min_len: 0, mask_s, mask_l }
    }

    /// Returns a new `Opts` with the minimum chunk length set.
    ///
    /// All other options are carried over unchanged.
    ///
    /// # Panics
    ///
    /// If the `min_len` >= `desired_len`.
    pub fn with_min_len(self, min_len: usize) -> Self {
        assert!(min_len < self.desired_len);
        Self { min_len, ..self }
    }
}

impl Default for Opts {
    fn default() -> Self {
        Self::new(4096, 2).with_min_len(1024)
    }
}

/// Base-2 logarithm for usize integers, rounded up to the nearest integer.
///
/// Returns the smallest `n` such that `2^n >= val`.  Both `0` and `1` map to `0`.
///
/// Computed with integer arithmetic so that the result is exact for every `usize`; going through
/// `f64` rounds large inputs (e.g. `2^53 + 1`) down to the previous power of two.
fn log2(val: usize) -> usize {
    if val <= 1 {
        return 0;
    }
    // For `val >= 2`, the number of bits needed to represent `val - 1` is ceil(log2(val)).
    (usize::BITS - (val - 1).leading_zeros()) as usize
}

/// Chunk returned from the chunking iterator [`Iter`].
///
/// Describes a contiguous region of the input data by position only; the bytes themselves are
/// not stored.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Digest reported by the hasher immediately after the last byte of this chunk was consumed.
    pub hash: u64,
    /// Index into the input data of the first byte of this chunk.
    pub offset: usize,
    /// Number of bytes in this chunk.
    pub len: usize,
}

/// An iterator over the chunks determined by [`Hasher`] chunk boundaries.
///
/// Each call to `next` feeds bytes of `data` into the hasher until a boundary is found or the
/// data runs out, and yields the resulting [`Chunk`].
pub struct Iter<'a, 't> {
    /// Hasher that every byte is fed into; mutably borrowed for the lifetime of the iterator.
    hasher: &'a mut Hasher<'t>,
    /// Input data being split into chunks.
    data: &'a [u8],
    /// Chunking options (length thresholds and boundary masks).
    opts: Opts,
    /// Index into `data` of the next byte to consume; also the offset of the next chunk.
    cursor: usize,
}

impl<'a, 't> Iter<'a, 't> {
    /// Returns a new chunking iterator.  This struct is created by the [`Hasher::chunks`] method.
    /// See its documentation for more.
    pub fn new(hasher: &'a mut Hasher<'t>, data: &'a [u8], opts: Opts) -> Self {
        Self { hasher, data, opts, cursor: 0 }
    }
}

impl<'a, 't> Iterator for Iter<'a, 't> {
    type Item = Chunk;

    /// Consumes bytes from the current cursor until the hasher reports a chunk boundary or the
    /// data is exhausted, and returns the chunk covering those bytes.  Returns `None` once all
    /// of the data has been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the shared slice reference out so iterating it does not hold a borrow of `self`.
        let data = self.data;
        let start = self.cursor;
        if start == data.len() {
            return None;
        }

        for &byte in &data[start..] {
            self.hasher.consume(byte);
            self.cursor += 1;
            let len = self.cursor - start;

            // Cut-points are only considered once the minimum chunk length has been reached.
            if len >= self.opts.min_len {
                // Normalized chunking: the small mask applies below the desired length, and
                // the large mask from the desired length onwards.
                let boundary_mask = if len >= self.opts.desired_len {
                    self.opts.mask_l
                } else {
                    self.opts.mask_s
                };
                if self.hasher.is_chunk_boundary(boundary_mask) {
                    break;
                }
            }
        }

        // NOTE(joshleeb): A possible optimization here would be to merge a final chunk that is
        // below the minimum size into this chunk.  For example, rather than two chunks of
        // |8192|12| we would have a single final chunk |8204|.
        let len = self.cursor - start;
        Some(Chunk { hash: self.hasher.digest(), offset: start, len })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::MASK_SIZE;
    use quickcheck_macros::quickcheck;
    use std::cmp::max;

    /// Fixed all-zero input used by the deterministic tests.
    const DATA: [u8; 42] = [0; 42];

    #[test]
    fn default_opts() {
        // Check that the default opts doesn't panic.
        Opts::default();
    }

    /// Chunking empty data yields no chunks.
    #[test]
    fn empty() {
        let mut hasher = Hasher::default();
        let mut chunks = Iter::new(&mut hasher, &[], opts(13));
        assert!(chunks.next().is_none());
    }

    /// When no boundary is found, all of the data is returned as a single chunk whose hash is the
    /// hasher's final digest.
    #[test]
    fn remaining() {
        let mut hasher = Hasher::default();
        let chunks = Iter::new(&mut hasher, &DATA, opts(13));
        assert_eq!(
            chunks.collect::<Vec<_>>(),
            [Chunk { hash: hasher.digest(), offset: 0, len: DATA.len() }]
        );
    }

    /// The chunk lengths sum to the length of the input data.
    #[quickcheck]
    fn total_len(data: Vec<u8>, n: usize) {
        let mut hasher = Hasher::default();
        let chunks = hasher.chunks(&data, opts(n));

        let len: usize = chunks.map(|c| c.len).sum();
        assert_eq!(len, data.len());
    }

    /// The first chunk starts at offset 0 and each chunk starts where the previous one ended.
    #[quickcheck]
    fn offset_contiguity(data: Vec<u8>, n: usize) {
        let mut hasher = Hasher::default();
        let chunks: Vec<_> = hasher.chunks(&data, opts(n)).collect();

        if !chunks.is_empty() {
            assert_eq!(chunks[0].offset, 0);
        }
        for cs in chunks.as_slice().windows(2) {
            assert_eq!(cs[0].offset + cs[0].len, cs[1].offset);
        }
    }

    /// No empty chunks are ever yielded.
    #[quickcheck]
    fn non_zero_len(data: Vec<u8>, n: usize) {
        let mut hasher = Hasher::default();
        let mut chunks = hasher.chunks(&data, opts(n));
        assert!(chunks.all(|c| c.len > 0));
    }

    /// Builds options with a desired length of `2^n` and a normalization level of 1, after
    /// reducing an arbitrary `n` into the range `2..MASK_SIZE` so that `Opts::new` does not
    /// panic.
    fn opts(mut n: usize) -> Opts {
        n = max(2, n % MASK_SIZE);
        Opts::new(2usize.pow(n as u32), 1)
    }
}

M src/lib.rs => src/lib.rs +7 -0
@@ 1,8 1,10 @@
pub use chunk::Chunk;
pub use table::Table;

use std::fmt::{self, Debug, Formatter};
use table::GEAR_TABLE;

pub mod chunk;
mod table;

/// Gear hasher.


@@ 39,6 41,11 @@ impl<'t> Hasher<'t> {
        self.hash
    }

    /// Returns an iterator over the chunks determined by chunk boundaries.
    ///
    /// The hasher is mutably borrowed for as long as the iterator lives; the bytes of `data` are
    /// consumed into this hasher as the iterator advances.  `opts` controls the chunk length
    /// thresholds and the masks used to detect boundaries.
    pub fn chunks<'a>(&'a mut self, data: &'a [u8], opts: chunk::Opts) -> chunk::Iter<'a, 't> {
        chunk::Iter::new(self, data, opts)
    }

    /// Returns true if the current digest is a chunk boundary with the `mask`, and false
    /// otherwise.  To generate a mask value, see [`mask`].
    pub fn is_chunk_boundary(&self, mask: u64) -> bool {