~joshleeb/fastcdc

33a7b88de08e81eae4d6d65f0901db2a4caf7dda — Josh Leeb-du Toit 1 year, 3 months ago d0cba74
Move mask generation to private module
3 files changed, 117 insertions(+), 115 deletions(-)

M src/chunk.rs
M src/lib.rs
A src/mask.rs
M src/chunk.rs => src/chunk.rs +6 -4
@@ 1,4 1,7 @@
use crate::{mask, Hasher, MASK_SIZE};
use crate::{
    mask::{self, MASK_SIZE},
    Hasher,
};

/// Chunking options.
#[derive(Debug, Clone, PartialEq, Eq)]


@@ 34,9 37,9 @@ impl Opts {
        // than desired, we use `mask_l` to make finding the chunk boundary easier, in that it is
        // more likely a chunk boundary is found.
        assert!(n + norm_level <= MASK_SIZE);
        let mask_s = mask(n + norm_level);
        let mask_s = mask::generate(n + norm_level);
        assert!(n - norm_level >= 1);
        let mask_l = mask(n - norm_level);
        let mask_l = mask::generate(n - norm_level);

        Self { desired_len, min_len: 0, mask_s, mask_l }
    }


@@ 124,7 127,6 @@ impl<'a, 't> Iterator for Iter<'a, 't> {
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MASK_SIZE;
    use quickcheck_macros::quickcheck;
    use std::cmp::max;


M src/lib.rs => src/lib.rs +1 -111
@@ 5,6 5,7 @@ use std::fmt::{self, Debug, Formatter};
use table::GEAR_TABLE;

pub mod chunk;
mod mask;
mod table;

/// Gear hasher.


@@ 64,114 65,3 @@ impl<'t> Debug for Hasher<'t> {
        f.debug_struct("Hasher").field("hash", &self.hash).finish()
    }
}

/// Size of the masks used by the Gear hash.
const MASK_SIZE: usize = 48;

/// Builds a mask with exactly `n` effective bits (i.e: 1 bits) spread evenly across the 48
/// most-significant bits of a `u64`.  The most-significant bit is always set and the remaining
/// low bits are always zero.  This follows the technique described in the FastCDC paper.
///
/// > [The masks] are empirically derived values where the padded zero bits are almost evenly
/// > distributed for slightly higher deduplication ratio according to our large scale tests.
///
/// # Panics
///
/// Panics if `n` is outside the range `1..=48`.
pub fn mask(n: usize) -> u64 {
    assert!((1..=MASK_SIZE).contains(&n));

    // The top bit is fixed to 1, which leaves `MASK_SIZE - 1` positions in which to spread the
    // remaining `n - 1` effective bits.
    let mut tail = [0u8; MASK_SIZE - 1];
    distribute(&mut tail, n - 1);

    // Start from the fixed leading 1-bit and append each distributed bit below it.
    let mut packed: u64 = 1;
    for bit in tail {
        packed = (packed << 1) | u64::from(bit);
    }

    // Move the populated bits to the top of the word, zero-filling the low bits.
    packed << (64 - MASK_SIZE)
}

/// Distribute `n` 1-bits throughout the array of `bits`, overwriting every element with either
/// `0` or `1`.
///
/// # Panics
///
/// Panics if `n` is greater than `bits.len()`.
fn distribute(bits: &mut [u8], n: usize) {
    // Number of 0-bits we expect to have in the result.  The subtraction is checked so that an
    // oversized `n` fails loudly in every build profile instead of wrapping in release builds.
    let n0 = bits
        .len()
        .checked_sub(n)
        .expect("cannot distribute more 1-bits than there are positions");
    // Running tallies for the 0-bits and 1-bits written to `bits`.  Both start at 1 (not 0), so
    // the very first comparison below is simply `n0 < n`.
    let (mut zeros, mut ones) = (1usize, 1usize);

    // The target ratio of 1-bits to 0-bits is `n / n0`.  For each position we compare the
    // running ratio `ones / zeros` with that target and write a 1-bit if running < target,
    // otherwise a 0-bit.  The comparison is cross-multiplied to avoid divisions, which also
    // keeps it well-defined when `n0` is 0.
    for b in bits.iter_mut() {
        if ones * n0 < zeros * n {
            ones += 1;
            *b = 1;
        } else {
            zeros += 1;
            *b = 0;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every valid `n` yields a mask with the top bit set, the low 16 bits clear, and exactly
    /// `n` effective bits.
    #[test]
    fn valid_mask() {
        for n in 1..=MASK_SIZE {
            // Compute the mask once per `n` and check all properties against the same value.
            let m = mask(n);
            // Most-significant bit is 1.
            assert_eq!(m >> 63, 1);
            // Least-significant 16 bits are 0.
            assert_eq!(m & 0x0000_0000_0000_ffff, 0);
            // Contains the correct number of effective bits.
            assert_eq!(effective_bits(m), n);
        }
    }

    #[test]
    fn mask1() {
        // Should be 1 followed by 63 0's.
        assert_eq!(mask(1), 0x8000_0000_0000_0000);
    }

    #[test]
    fn mask13() {
        // Should be 1 followed by 12 evenly distributed 1's.
        assert_eq!(mask(13), 0x9111_1111_1111_0000);
    }

    #[test]
    fn mask48() {
        // Should be 48 1's followed by 16 0's.
        assert_eq!(mask(48), 0xffff_ffff_ffff_0000);
    }

    /// `distribute` writes exactly the requested number of 1-bits.
    #[test]
    fn distribute_bits() {
        for n in 1..=MASK_SIZE {
            // A fixed-size array is enough here; no heap allocation is needed.
            let mut arr = [0u8; MASK_SIZE];
            distribute(&mut arr, n);
            assert_eq!(arr.iter().filter(|b| **b == 1).count(), n);
        }
    }

    #[test]
    fn count_effective_bits() {
        assert_eq!(effective_bits(0b0000), 0);
        assert_eq!(effective_bits(0b0001), 1);
        assert_eq!(effective_bits(0b0110), 2);
        assert_eq!(effective_bits(0b1110), 3);
    }

    /// Number of 1-bits in `n`.
    fn effective_bits(n: u64) -> usize {
        // `count_ones` returns at most 64, so widening to `usize` is lossless.
        n.count_ones() as usize
    }
}

A src/mask.rs => src/mask.rs +110 -0
@@ 0,0 1,110 @@
/// Size, in bits, of the masks used in FastCDC.
pub const MASK_SIZE: usize = 48;

/// Builds a mask with exactly `n` effective bits (i.e: 1 bits) spread evenly across the 48
/// most-significant bits of a `u64`.  The most-significant bit is always set and the remaining
/// low bits are always zero.  This follows the technique described in the FastCDC paper.
///
/// > [The masks] are empirically derived values where the padded zero bits are almost evenly
/// > distributed for slightly higher deduplication ratio according to our large scale tests.
///
/// # Panics
///
/// Panics if `n` is outside the range `1..=48`.
pub fn generate(n: usize) -> u64 {
    assert!((1..=MASK_SIZE).contains(&n));

    // The top bit is fixed to 1, which leaves `MASK_SIZE - 1` positions in which to spread the
    // remaining `n - 1` effective bits.
    let mut tail = [0u8; MASK_SIZE - 1];
    distribute(&mut tail, n - 1);

    // Start from the fixed leading 1-bit and append each distributed bit below it.
    let mut packed: u64 = 1;
    for bit in tail {
        packed = (packed << 1) | u64::from(bit);
    }

    // Move the populated bits to the top of the word, zero-filling the low bits.
    packed << (64 - MASK_SIZE)
}

/// Distribute `n` 1-bits throughout the array of `bits`, overwriting every element with either
/// `0` or `1`.
///
/// # Panics
///
/// Panics if `n` is greater than `bits.len()`.
fn distribute(bits: &mut [u8], n: usize) {
    // Number of 0-bits we expect to have in the result.  The subtraction is checked so that an
    // oversized `n` fails loudly in every build profile instead of wrapping in release builds.
    let n0 = bits
        .len()
        .checked_sub(n)
        .expect("cannot distribute more 1-bits than there are positions");
    // Running tallies for the 0-bits and 1-bits written to `bits`.  Both start at 1 (not 0), so
    // the very first comparison below is simply `n0 < n`.
    let (mut zeros, mut ones) = (1usize, 1usize);

    // The target ratio of 1-bits to 0-bits is `n / n0`.  For each position we compare the
    // running ratio `ones / zeros` with that target and write a 1-bit if running < target,
    // otherwise a 0-bit.  The comparison is cross-multiplied to avoid divisions, which also
    // keeps it well-defined when `n0` is 0.
    for b in bits.iter_mut() {
        if ones * n0 < zeros * n {
            ones += 1;
            *b = 1;
        } else {
            zeros += 1;
            *b = 0;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every valid `n` yields a mask with the top bit set, the low 16 bits clear, and exactly
    /// `n` effective bits.
    #[test]
    fn valid_mask() {
        for n in 1..=MASK_SIZE {
            // Compute the mask once per `n` and check all properties against the same value.
            let mask = generate(n);
            // Most-significant bit is 1.
            assert_eq!(mask >> 63, 1);
            // Least-significant 16 bits are 0.
            assert_eq!(mask & 0x0000_0000_0000_ffff, 0);
            // Contains the correct number of effective bits.
            assert_eq!(effective_bits(mask), n);
        }
    }

    #[test]
    fn generate1() {
        // Should be 1 followed by 63 0's.
        assert_eq!(generate(1), 0x8000_0000_0000_0000);
    }

    #[test]
    fn generate13() {
        // Should be 1 followed by 12 evenly distributed 1's.
        assert_eq!(generate(13), 0x9111_1111_1111_0000);
    }

    #[test]
    fn generate48() {
        // Should be 48 1's followed by 16 0's.
        assert_eq!(generate(48), 0xffff_ffff_ffff_0000);
    }

    /// `distribute` writes exactly the requested number of 1-bits.
    #[test]
    fn distribute_bits() {
        for n in 1..=MASK_SIZE {
            // A fixed-size array is enough here; no heap allocation is needed.
            let mut arr = [0u8; MASK_SIZE];
            distribute(&mut arr, n);
            assert_eq!(arr.iter().filter(|b| **b == 1).count(), n);
        }
    }

    #[test]
    fn count_effective_bits() {
        assert_eq!(effective_bits(0b0000), 0);
        assert_eq!(effective_bits(0b0001), 1);
        assert_eq!(effective_bits(0b0110), 2);
        assert_eq!(effective_bits(0b1110), 3);
    }

    /// Number of 1-bits in `n`.
    fn effective_bits(n: u64) -> usize {
        // `count_ones` returns at most 64, so widening to `usize` is lossless.
        n.count_ones() as usize
    }
}