@@ -0,0 +1,189 @@
+use crate::{mask, Hasher, MASK_SIZE};
+
+/// Chunking options.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Opts {
+ /// Desired chunk length.
+ pub desired_len: usize,
+ /// Minimum chunk length.
+ pub min_len: usize,
+ /// Small mask, with an increased number of effective bits.
+ mask_s: u64,
+ /// Large mask, with a decreased number of effective bits.
+ mask_l: u64,
+}
+
+impl Opts {
+ /// Returns a new `Opts` given the desired chunk length and the chunking normalization level.
+ ///
+ /// # Panics
+ ///
+ /// If the number of effective mask bits derived from `desired_len` and `norm_level` falls
+ /// outside the supported range of 1 to 48 bits. More specifically, if
+ ///
+ /// - log2(desired_len) + norm_level > 48; or
+ /// - log2(desired_len) - norm_level < 1.
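+ ///
+ /// # Examples
+ ///
+ /// A usage sketch mirroring the crate defaults (marked `ignore`, as the exact import path
+ /// depends on the crate layout):
+ ///
+ /// ```ignore
+ /// // Target ~4096-byte chunks with normalization level 2 and a 1024-byte minimum.
+ /// let opts = Opts::new(4096, 2).with_min_len(1024);
+ /// assert_eq!(opts, Opts::default());
+ /// ```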
+ pub fn new(desired_len: usize, norm_level: usize) -> Self {
+ let n = log2(desired_len);
+
+ // The masks are derived from the desired chunk length and the chunking normalization
+ // level. `mask_s` contains more effective bits than `mask_l` (for `norm_level` > 0).
+ //
+ // While the prospective chunk is shorter than the desired length, `mask_s` is used to
+ // make finding a chunk boundary harder. Once the prospective chunk is longer than
+ // desired, `mask_l` is used instead, making it more likely that a boundary is found.
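+ //
+ // For example, with the defaults `desired_len = 4096` and `norm_level = 2`, `n` is 12, so
+ // `mask_s` has 14 effective bits (a boundary matches roughly 1 in 2^14 positions) and
+ // `mask_l` has 10 (roughly 1 in 2^10).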
+ assert!(n + norm_level <= MASK_SIZE);
+ let mask_s = mask(n + norm_level);
+ // Written as `n > norm_level` to avoid underflow in the unsigned subtraction below.
+ assert!(n > norm_level);
+ let mask_l = mask(n - norm_level);
+
+ Self { desired_len, min_len: 0, mask_s, mask_l }
+ }
+
+ /// Returns a new `Opts` with the minimum chunk length set.
+ ///
+ /// # Panics
+ ///
+ /// If `min_len >= desired_len`.
+ pub fn with_min_len(self, min_len: usize) -> Self {
+ assert!(min_len < self.desired_len);
+ Self { min_len, ..self }
+ }
+}
+
+impl Default for Opts {
+ fn default() -> Self {
+ Self::new(4096, 2).with_min_len(1024)
+ }
+}
+
+/// Ceiling of the base-2 logarithm for usize integers.
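+/// For example, `log2(4096)` is 12 and `log2(5000)` is 13.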
+fn log2(val: usize) -> usize {
+ let fval = val as f64;
+ fval.log2().ceil() as usize
+}
+
+/// Chunk returned from the chunking iterator [`Iter`].
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Chunk {
+ /// Hasher digest at the chunk boundary.
+ pub hash: u64,
+ /// Offset of the chunk within the input data.
+ pub offset: usize,
+ /// Length of the chunk in bytes.
+ pub len: usize,
+}
+
+/// An iterator over the chunks determined by [`Hasher`] chunk boundaries.
+pub struct Iter<'a, 't> {
+ hasher: &'a mut Hasher<'t>,
+ data: &'a [u8],
+ opts: Opts,
+ cursor: usize,
+}
+
+impl<'a, 't> Iter<'a, 't> {
+ /// Returns a new chunking iterator. This struct is created by the [`Hasher::chunks`] method.
+ /// See its documentation for more.
+ pub fn new(hasher: &'a mut Hasher<'t>, data: &'a [u8], opts: Opts) -> Self {
+ Self { hasher, data, opts, cursor: 0 }
+ }
+}
+
+impl<'a, 't> Iterator for Iter<'a, 't> {
+ type Item = Chunk;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.cursor == self.data.len() {
+ return None;
+ }
+
+ let Opts { desired_len, min_len, .. } = self.opts;
+ let offset = self.cursor;
+ while let Some(b) = self.data.get(self.cursor) {
+ self.hasher.consume(*b);
+ self.cursor += 1;
+
+ // Sub-minimum chunk cut-point skipping.
+ let len = self.cursor - offset;
+ if len < min_len {
+ continue;
+ }
+
+ // Normalized chunking to select which mask to use.
+ let m = if len < desired_len { self.opts.mask_s } else { self.opts.mask_l };
+ if self.hasher.is_chunk_boundary(m) {
+ break;
+ }
+ }
+ // NOTE(joshleeb): A possible optimization would be to merge the final chunk into the
+ // previous one when it is below the minimum length. For example, rather than two chunks
+ // of lengths |8192|12| we would emit a single final chunk of |8204|.
+ Some(Chunk { hash: self.hasher.digest(), offset, len: self.cursor - offset })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::MASK_SIZE;
+ use quickcheck_macros::quickcheck;
+ use std::cmp::max;
+
+ const DATA: [u8; 42] = [0; 42];
+
+ #[test]
+ fn default_opts() {
+ // Check that constructing the default `Opts` doesn't panic.
+ Opts::default();
+ }
+
+ #[test]
+ fn empty() {
+ let mut hasher = Hasher::default();
+ let mut chunks = Iter::new(&mut hasher, &[], opts(13));
+ assert!(chunks.next().is_none());
+ }
+
+ #[test]
+ fn remaining() {
+ let mut hasher = Hasher::default();
+ let chunks = Iter::new(&mut hasher, &DATA, opts(13));
+ assert_eq!(
+ chunks.collect::<Vec<_>>(),
+ [Chunk { hash: hasher.digest(), offset: 0, len: DATA.len() }]
+ );
+ }
+
+ #[quickcheck]
+ fn total_len(data: Vec<u8>, n: usize) {
+ let mut hasher = Hasher::default();
+ let chunks = hasher.chunks(&data, opts(n));
+
+ let len: usize = chunks.map(|c| c.len).sum();
+ assert_eq!(len, data.len());
+ }
+
+ #[quickcheck]
+ fn offset_contiguity(data: Vec<u8>, n: usize) {
+ let mut hasher = Hasher::default();
+ let chunks: Vec<_> = hasher.chunks(&data, opts(n)).collect();
+
+ if !chunks.is_empty() {
+ assert_eq!(chunks[0].offset, 0);
+ }
+ for cs in chunks.as_slice().windows(2) {
+ assert_eq!(cs[0].offset + cs[0].len, cs[1].offset);
+ }
+ }
+
+ #[quickcheck]
+ fn non_zero_len(data: Vec<u8>, n: usize) {
+ let mut hasher = Hasher::default();
+ let mut chunks = hasher.chunks(&data, opts(n));
+ assert!(chunks.all(|c| c.len > 0));
+ }
+
+ fn opts(mut n: usize) -> Opts {
+ n = max(2, n % MASK_SIZE);
+ Opts::new(2usize.pow(n as u32), 1)
+ }
+}
@@ -1,8 +1,10 @@
+pub use chunk::Chunk;
pub use table::Table;

use std::fmt::{self, Debug, Formatter};
use table::GEAR_TABLE;

+pub mod chunk;
mod table;

/// Gear hasher.
@@ -39,6 +41,11 @@ impl<'t> Hasher<'t> {
self.hash
}

+ /// Returns an iterator over the chunks determined by chunk boundaries.
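+ ///
+ /// A usage sketch (doctest marked `ignore`, as the exact import paths depend on the crate
+ /// layout):
+ ///
+ /// ```ignore
+ /// let mut hasher = Hasher::default();
+ /// let data = vec![0u8; 16 * 1024];
+ ///
+ /// for chunk in hasher.chunks(&data, chunk::Opts::default()) {
+ ///     println!("offset={} len={} hash={:016x}", chunk.offset, chunk.len, chunk.hash);
+ /// }
+ /// ```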
+ pub fn chunks<'a>(&'a mut self, data: &'a [u8], opts: chunk::Opts) -> chunk::Iter<'a, 't> {
+ chunk::Iter::new(self, data, opts)
+ }
+
/// Returns true if the current digest is a chunk boundary with the `mask`, and false
/// otherwise. To generate a mask value, see [`mask`].
pub fn is_chunk_boundary(&self, mask: u64) -> bool {