M src/chunk.rs => src/chunk.rs +6 -4
@@ 1,4 1,7 @@
-use crate::{mask, Hasher, MASK_SIZE};
+use crate::{
+ mask::{self, MASK_SIZE},
+ Hasher,
+};
/// Chunking options.
#[derive(Debug, Clone, PartialEq, Eq)]
@@ 34,9 37,9 @@ impl Opts {
// than desired, we use `mask_l` to make finding the chunk boundary easier, in that is is
// more likely a chunk boundary is found.
assert!(n + norm_level <= MASK_SIZE);
- let mask_s = mask(n + norm_level);
+ let mask_s = mask::generate(n + norm_level);
assert!(n - norm_level >= 1);
- let mask_l = mask(n - norm_level);
+ let mask_l = mask::generate(n - norm_level);
Self { desired_len, min_len: 0, mask_s, mask_l }
}
@@ 124,7 127,6 @@ impl<'a, 't> Iterator for Iter<'a, 't> {
#[cfg(test)]
mod tests {
use super::*;
- use crate::MASK_SIZE;
use quickcheck_macros::quickcheck;
use std::cmp::max;
M src/lib.rs => src/lib.rs +1 -111
@@ 5,6 5,7 @@ use std::fmt::{self, Debug, Formatter};
use table::GEAR_TABLE;
pub mod chunk;
+mod mask;
mod table;
/// Gear hasher.
@@ 64,114 65,3 @@ impl<'t> Debug for Hasher<'t> {
f.debug_struct("Hasher").field("hash", &self.hash).finish()
}
}
-
-/// Size of the masks used by the Gear hash.
-const MASK_SIZE: usize = 48;
-
-/// Returns a mask that contains `n` effective bits (i.e: 1 bits) evenly distributed throughout the
-/// 48 most-significant bits. This follows the technique described in the FastCDC paper.
-///
-/// > [The masks] are empirically dervied values where the padded zero bits are almost evenly
-/// > distributedfor slightly higher deduplication ratio according to our large scale tests.
-///
-/// # Panics
-///
-/// Panics if `n` is outside the range `1..=48`.
-pub fn mask(n: usize) -> u64 {
- assert!((1..=MASK_SIZE).contains(&n));
-
- // Evenly distribute `n-1` effective bits. The most-significant bit must be a 1-bit, so we
- // only distribute `n-1` remaining 1-bits rather than `n` 1-bits.
- let mut ef_bits = [0u8; MASK_SIZE - 1];
- distribute(&mut ef_bits, n - 1);
-
- // Convert array of bits to a u64, ensuring that the most-significant bit is 1 and
- // left-shifting for the remaining 0-bits.
- let m = ef_bits.into_iter().fold(1, |acc, bit| (acc << 1) | bit as u64);
- m << (64 - MASK_SIZE)
-}
-
-/// Distribute `n` 1-bits throughout the array of `bits`.
-fn distribute(bits: &mut [u8], n: usize) {
- // Number of 0-bits we expect to have in the result.
- let n0 = bits.len() - n;
- // Count of 0-bits and 1-bits written to `bits`, respectively.
- let mut count = (1, 1);
-
- // Consider the ratio of 1-bits and 0-bits written to `bits`. For an array of length `L` with
- // `N` 1-bits to distribute, the ideal ratio of 1-bits to 0-bits is `L / N`. In the loop, we
- // compare the actual ratio with the ideal ratio and write a 1-bit if the actual < ideal,
- // otherwise we write a 0-bit. Note that the conditional expr is written to avoid divisions,
- // but it can be rewritten as `(count.1 / count.0) < (n / n0)`.
- for b in bits.iter_mut() {
- if count.1 * n0 < count.0 * n {
- count.1 += 1;
- *b = 1;
- } else {
- count.0 += 1;
- *b = 0;
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn valid_mask() {
- for n in 1..=MASK_SIZE {
- // Most-significant bit is 1.
- assert_eq!(mask(n) >> 63, 1);
- // Least-significant 16 bits are 0.
- assert_eq!(mask(n) & 0x0000_0000_0000_ffff, 0);
- // Contains the correct number of effective bits.
- assert_eq!(effective_bits(mask(n)), n);
- }
- }
-
- #[test]
- fn mask1() {
- // Should be 1 followed by 63 0's.
- assert_eq!(mask(1), 0x8000_0000_0000_0000);
- }
-
- #[test]
- fn mask13() {
- // Should be 1 followed by 12 evenly distributed 1's.
- assert_eq!(mask(13), 0x9111_1111_1111_0000);
- }
-
- #[test]
- fn mask48() {
- // Should be 48 1's followed by 16 0's.
- assert_eq!(mask(48), 0xffff_ffff_ffff_0000);
- }
-
- #[test]
- fn distribute_bits() {
- for n in 1..=MASK_SIZE {
- let mut arr = vec![0; MASK_SIZE];
- distribute(&mut arr, n);
- assert_eq!(arr.iter().filter(|b| **b == 1).count(), n);
- }
- }
-
- #[test]
- fn count_effective_bits() {
- assert_eq!(effective_bits(0b0000), 0);
- assert_eq!(effective_bits(0b0001), 1);
- assert_eq!(effective_bits(0b0110), 2);
- assert_eq!(effective_bits(0b1110), 3);
- }
-
- fn effective_bits(n: u64) -> usize {
- let mut mask = 0x1;
- let mut count = 0;
- for _ in 0..64 {
- count += usize::from((n & mask) > 0);
- mask <<= 1;
- }
- count
- }
-}
A src/mask.rs => src/mask.rs +110 -0
@@ 0,0 1,110 @@
+/// Size, in bits, of the masks used in FastCDC.
+pub const MASK_SIZE: usize = 48;
+
+/// Returns a mask that contains `n` effective bits (i.e: 1 bits) evenly distributed throughout the
+/// 48 most-significant bits. This follows the technique described in the FastCDC paper.
+///
+/// > [The masks] are empirically derived values where the padded zero bits are almost evenly
+/// > distributed for slightly higher deduplication ratio according to our large scale tests.
+///
+/// # Panics
+///
+/// Panics if `n` is outside the range `1..=48`.
+pub fn generate(n: usize) -> u64 {
+ assert!((1..=MASK_SIZE).contains(&n));
+
+ // Evenly distribute `n-1` effective bits. The most-significant bit must be a 1-bit, so we
+ // only distribute `n-1` remaining 1-bits rather than `n` 1-bits.
+ let mut ef_bits = [0u8; MASK_SIZE - 1];
+ distribute(&mut ef_bits, n - 1);
+
+ // Convert array of bits to a u64, ensuring that the most-significant bit is 1 and
+ // left-shifting for the remaining 0-bits.
+ let m = ef_bits.into_iter().fold(1, |acc, bit| (acc << 1) | bit as u64);
+ m << (64 - MASK_SIZE)
+}
+
+/// Distribute `n` 1-bits throughout the array of `bits`.
+fn distribute(bits: &mut [u8], n: usize) {
+ // Number of 0-bits we expect to have in the result.
+ let n0 = bits.len() - n;
+ // Counters for 0-bits and 1-bits written to `bits`, respectively; both start at 1, not 0.
+ let mut count = (1, 1);
+
+ // Consider the ratio of 1-bits to 0-bits written to `bits`. For an array of length `L` with
+ // `N` 1-bits to distribute, the ideal ratio of 1-bits to 0-bits is `N / (L - N)`. In the loop,
+ // we compare the actual ratio with the ideal ratio and write a 1-bit if the actual < ideal,
+ // otherwise we write a 0-bit. Note that the conditional expr is written to avoid divisions,
+ // but it can be rewritten as `(count.1 / count.0) < (n / n0)`.
+ for b in bits.iter_mut() {
+ if count.1 * n0 < count.0 * n {
+ count.1 += 1;
+ *b = 1;
+ } else {
+ count.0 += 1;
+ *b = 0;
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn valid_mask() {
+ for n in 1..=MASK_SIZE {
+ // Most-significant bit is 1.
+ assert_eq!(generate(n) >> 63, 1);
+ // Least-significant 16 bits are 0.
+ assert_eq!(generate(n) & 0x0000_0000_0000_ffff, 0);
+ // Contains the correct number of effective bits.
+ assert_eq!(effective_bits(generate(n)), n);
+ }
+ }
+
+ #[test]
+ fn generate1() {
+ // Should be 1 followed by 63 0's.
+ assert_eq!(generate(1), 0x8000_0000_0000_0000);
+ }
+
+ #[test]
+ fn generate13() {
+ // Should be 1 followed by 12 evenly distributed 1's.
+ assert_eq!(generate(13), 0x9111_1111_1111_0000);
+ }
+
+ #[test]
+ fn generate48() {
+ // Should be 48 1's followed by 16 0's.
+ assert_eq!(generate(48), 0xffff_ffff_ffff_0000);
+ }
+
+ #[test]
+ fn distribute_bits() {
+ for n in 1..=MASK_SIZE {
+ let mut arr = vec![0; MASK_SIZE];
+ distribute(&mut arr, n);
+ assert_eq!(arr.iter().filter(|b| **b == 1).count(), n);
+ }
+ }
+
+ #[test]
+ fn count_effective_bits() {
+ assert_eq!(effective_bits(0b0000), 0);
+ assert_eq!(effective_bits(0b0001), 1);
+ assert_eq!(effective_bits(0b0110), 2);
+ assert_eq!(effective_bits(0b1110), 3);
+ }
+
+ fn effective_bits(n: u64) -> usize {
+ let mut mask = 0x1;
+ let mut count = 0;
+ for _ in 0..64 {
+ count += usize::from((n & mask) > 0);
+ mask <<= 1;
+ }
+ count
+ }
+}