~ntietz/isabella-db

669a0f01cd0c4280113b2a2edd1e9b725b60542a — Nicole Tietz-Sokolskaya 1 year, 3 months ago aa4d34a
Write basic bitmap and position index implementation along with design
doc for where things are headed.

This is an incomplete position index implementation which serves as the
foundation for what is needed going forward. It's primarily missing:

- nice iterator to consume results (todo with the consumer)
- metrics/stats as laid out in the doc

Signed-off-by: Nicholas Tietz-Sokolsky <me@ntietz.com>
Implements: https://todo.sr.ht/~ntietz/isabella-db/6
M .gitignore => .gitignore +5 -0
@@ 1,1 1,6 @@
target/

# data files
*.isa
*.isa.*
pgns/

M Cargo.lock => Cargo.lock +63 -0
@@ 41,6 41,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

[[package]]
name = "bitmap"
version = "0.1.0"
dependencies = [
 "serde",
 "thiserror",
]

[[package]]
name = "btoi"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 99,6 107,17 @@ dependencies = [
]

[[package]]
name = "getrandom"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
dependencies = [
 "cfg-if",
 "libc",
 "wasi",
]

[[package]]
name = "heck"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 118,8 137,10 @@ name = "isabella-db"
version = "0.1.0"
dependencies = [
 "bincode",
 "bitmap",
 "clap",
 "pgn",
 "rand",
 "rmp-serde",
 "serde",
 "shakmaty",


@@ 264,6 285,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"

[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 306,6 333,36 @@ dependencies = [
]

[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
 "libc",
 "rand_chacha",
 "rand_core",
]

[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
 "ppv-lite86",
 "rand_core",
]

[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
 "getrandom",
]

[[package]]
name = "regex"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 561,6 618,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"

M Cargo.toml => Cargo.toml +2 -1
@@ 1,6 1,6 @@
[workspace]

members = ["isabella", "pgn"]
members = ["bitmap", "isabella", "pgn"]

[workspace.dependencies]



@@ 13,3 13,4 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "std"] 
[profile.release]
debug = true
lto = "fat"
opt-level = 3

A bitmap/Cargo.toml => bitmap/Cargo.toml +10 -0
@@ 0,0 1,10 @@
[package]
name = "bitmap"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde = { workspace = true }
thiserror = { workspace = true }

A bitmap/src/dense.rs => bitmap/src/dense.rs +243 -0
@@ 0,0 1,243 @@
use std::ops::{BitAnd, BitOr, Not};

use super::util::div_with_rem;
use super::{BitmapError, ItemID};

/// DenseBitmap stores one bit for each record that's indexed. It has a fixed
/// size assigned at creation and cannot be resized.
#[derive(Debug, PartialEq, Eq)]
pub struct DenseBitmap {
    /// Backing words: item `k` lives in bit `k % usize::BITS` of
    /// `chunks[k / usize::BITS]`.
    chunks: Vec<usize>,
    /// Number of addressable bits; `get`/`set` reject keys at or beyond it.
    size: usize,
}

/// Iterator over the set bits of a [`DenseBitmap`], yielding their item IDs in
/// ascending order.
pub struct DenseBitmapIterator<'a> {
    /// Bitmap being traversed.
    bitmap: &'a DenseBitmap,
    /// Next item ID that has not been examined yet.
    item_id: usize,
    /// Unexamined remainder of the current chunk, shifted so that bit 0
    /// corresponds to `item_id`.
    chunk: usize,
}

impl<'a> Iterator for DenseBitmapIterator<'a> {
    type Item = ItemID;

    /// Returns the ID of the next set bit below the bitmap's size, or `None`
    /// once no set bits remain.
    ///
    /// Empty stretches are skipped a whole chunk at a time, and the next chunk
    /// is loaded whenever the cursor crosses a chunk boundary, including right
    /// after yielding the last bit of a chunk.
    fn next(&mut self) -> Option<Self::Item> {
        const BITS: usize = usize::BITS as usize;
        let size = self.bitmap.size;

        while self.item_id < size {
            if self.chunk == 0 {
                // Nothing left in this chunk: jump straight to the next one.
                let next_chunk_id = self.item_id / BITS + 1;
                match self.bitmap.chunks.get(next_chunk_id) {
                    Some(&chunk) => {
                        self.item_id = next_chunk_id * BITS;
                        self.chunk = chunk;
                    }
                    // No more backing storage, so there are no more set bits.
                    None => break,
                }
                continue;
            }

            let skip = self.chunk.trailing_zeros() as usize;
            let found = self.item_id + skip;
            if found >= size {
                // Bits stored past `size` (e.g. after a NOT) are not items.
                break;
            }

            self.item_id = found + 1;
            self.chunk = if self.item_id % BITS == 0 {
                // Crossed into the next chunk: load it rather than shifting
                // the exhausted one.
                self.bitmap
                    .chunks
                    .get(self.item_id / BITS)
                    .copied()
                    .unwrap_or(0)
            } else {
                // Two shifts so the total never reaches the word width.
                (self.chunk >> skip) >> 1
            };

            return Some(found);
        }

        // Park the cursor at the end so further calls keep returning `None`.
        self.item_id = size;
        None
    }
}

impl DenseBitmap {
    /// Creates a bitmap with `size` addressable bits, all cleared.
    pub fn of_size(size: usize) -> Self {
        let chunk_count = minimum_chunks(size, usize::BITS as usize);

        DenseBitmap {
            chunks: vec![0; chunk_count],
            size,
        }
    }

    /// Number of addressable bits in this bitmap.
    pub fn size(&self) -> usize {
        self.size
    }

    /// Writes `value` into the bit for `key`.
    ///
    /// # Errors
    ///
    /// Returns [`BitmapError::OutOfBounds`] when `key` is not below the
    /// bitmap's size.
    ///
    /// # Panics
    ///
    /// Panics if the backing storage holds fewer chunks than the size requires.
    pub fn set(&mut self, key: ItemID, value: bool) -> Result<(), BitmapError> {
        if key >= self.size {
            return Err(BitmapError::OutOfBounds(key, self.size));
        }

        let (word_idx, bit_idx) = div_with_rem(key, usize::BITS as usize);
        let word = self
            .chunks
            .get_mut(word_idx)
            .expect("bitmap should have enough chunks");

        let mask: usize = 1 << bit_idx;
        *word = if value { *word | mask } else { *word & !mask };

        Ok(())
    }

    /// Reads the bit for `key`; `None` when `key` is outside the bitmap.
    pub fn get(&self, key: ItemID) -> Option<bool> {
        if key >= self.size {
            return None;
        }

        let (word_idx, bit_idx) = div_with_rem(key, usize::BITS as usize);
        let word = self.chunks.get(word_idx)?;

        Some(((*word >> bit_idx) & 1) != 0)
    }
}

impl From<(usize, Vec<usize>)> for DenseBitmap {
    fn from((size, chunks): (usize, Vec<usize>)) -> Self {
        DenseBitmap { chunks, size }
    }
}

impl BitAnd for &DenseBitmap {
    type Output = DenseBitmap;

    /// Intersects two bitmaps chunk by chunk. The result is as large as the
    /// smaller operand.
    ///
    /// # Panics
    ///
    /// Panics if either operand has fewer chunks than the result needs.
    fn bitand(self, rhs: Self) -> Self::Output {
        let size = self.size.min(rhs.size);
        let num_chunks = minimum_chunks(size, usize::BITS as usize);

        let lhs_words = &self.chunks[..num_chunks];
        let rhs_words = &rhs.chunks[..num_chunks];
        let chunks = lhs_words
            .iter()
            .zip(rhs_words)
            .map(|(l, r)| l & r)
            .collect();

        DenseBitmap { size, chunks }
    }
}

impl BitOr for &DenseBitmap {
    type Output = DenseBitmap;

    /// Unions two bitmaps chunk by chunk. The result is as large as the
    /// smaller operand, so bits of the larger operand beyond that are dropped.
    ///
    /// # Panics
    ///
    /// Panics if either operand has fewer chunks than the result needs.
    fn bitor(self, rhs: Self) -> Self::Output {
        let size = self.size.min(rhs.size);
        let num_chunks = minimum_chunks(size, usize::BITS as usize);

        let lhs_words = &self.chunks[..num_chunks];
        let rhs_words = &rhs.chunks[..num_chunks];
        let chunks = lhs_words
            .iter()
            .zip(rhs_words)
            .map(|(l, r)| l | r)
            .collect();

        DenseBitmap { size, chunks }
    }
}

impl Not for &DenseBitmap {
    type Output = DenseBitmap;

    /// Produces a bitmap of the same size with every stored chunk inverted.
    ///
    /// Whole chunks are flipped, so bits of the last chunk that lie beyond
    /// `size` are flipped as well.
    fn not(self) -> Self::Output {
        let mut chunks: Vec<usize> = Vec::with_capacity(self.chunks.len());
        for word in &self.chunks {
            chunks.push(!word);
        }

        DenseBitmap {
            chunks,
            size: self.size,
        }
    }
}

/// Determines the minimum number of chunks needed to hold `size` elements when
/// each chunk holds `bits_per_chunk` of them.
///
/// The result is the quotient rounded up. It is computed without adding to
/// `size`, so it cannot overflow even for sizes close to `usize::MAX`.
///
/// # Panics
///
/// Panics if `bits_per_chunk` is zero.
///
/// # Examples
///
/// ```
///     assert_eq!(0, bitmap::dense::minimum_chunks(0, 16));
///     assert_eq!(1, bitmap::dense::minimum_chunks(1, 16));
///     assert_eq!(1, bitmap::dense::minimum_chunks(15, 16));
///     assert_eq!(1, bitmap::dense::minimum_chunks(16, 16));
///     assert_eq!(2, bitmap::dense::minimum_chunks(17, 16));
/// ```
pub fn minimum_chunks(size: usize, bits_per_chunk: usize) -> usize {
    let full_chunks = size / bits_per_chunk;

    // A partial chunk is needed for any leftover elements.
    if size % bits_per_chunk == 0 {
        full_chunks
    } else {
        full_chunks + 1
    }
}

impl<'a> IntoIterator for &'a DenseBitmap {
    type Item = ItemID;
    type IntoIter = DenseBitmapIterator<'a>;

    /// Starts an iteration at item 0 with the first chunk loaded (or an
    /// all-zero chunk when the bitmap has no chunks).
    fn into_iter(self) -> Self::IntoIter {
        let first_chunk = match self.chunks.first() {
            Some(&word) => word,
            None => 0,
        };

        DenseBitmapIterator {
            chunk: first_chunk,
            item_id: 0,
            bitmap: self,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created bitmap reports its size and has every bit cleared.
    #[test]
    fn dense_bitmaps_default_to_0s() {
        let size: usize = 100;
        let bitmap = DenseBitmap::of_size(size);

        assert_eq!(bitmap.size(), size, "size should be set");
        for idx in 0..size {
            assert_eq!(bitmap.get(idx), Some(false), "failed at index {idx}");
        }
        // The first key past the end is out of range.
        assert_eq!(bitmap.get(size), None);
    }

    /// Bits can be set and cleared individually without affecting neighbours.
    #[test]
    fn dense_bitmaps_can_get_and_set() {
        let size: usize = 4;
        let mut bitmap = DenseBitmap::of_size(size);

        bitmap.set(2, true).expect("should set the value");

        assert_eq!(bitmap.get(0), Some(false));
        assert_eq!(bitmap.get(1), Some(false));
        assert_eq!(bitmap.get(2), Some(true));
        assert_eq!(bitmap.get(3), Some(false));

        // Clearing the same bit returns the bitmap to all zeros.
        bitmap.set(2, false).expect("should set the value");

        assert_eq!(bitmap.get(0), Some(false));
        assert_eq!(bitmap.get(1), Some(false));
        assert_eq!(bitmap.get(2), Some(false));
        assert_eq!(bitmap.get(3), Some(false));

        // First and last bits of the bitmap.
        bitmap.set(0, true).expect("should set the value");
        bitmap.set(3, true).expect("should set the value");

        assert_eq!(bitmap.get(0), Some(true));
        assert_eq!(bitmap.get(1), Some(false));
        assert_eq!(bitmap.get(2), Some(false));
        assert_eq!(bitmap.get(3), Some(true));
    }

    /// OR, AND and NOT operate chunk-wise on the raw words.
    #[test]
    fn dense_bitmap_bitwise_ops() {
        let a: DenseBitmap = (96, vec![15, 5]).into();
        let b: DenseBitmap = (96, vec![7, 6]).into();

        let expected_or: DenseBitmap = (96, vec![15, 7]).into();
        let expected_and: DenseBitmap = (96, vec![7, 4]).into();
        // !15 and !5 written out as 64-bit values, so this expectation only
        // holds where usize is 64 bits wide.
        let expected_not: DenseBitmap =
            (96, vec![18446744073709551600, 18446744073709551610]).into();

        let or = &a | &b;
        let and = &a & &b;
        let not = !&a;

        assert_eq!(or, expected_or);
        assert_eq!(and, expected_and);
        assert_eq!(not, expected_not);
    }

    /// Iteration yields the IDs of set bits in ascending order across chunks.
    #[test]
    fn dense_bitmap_can_be_iterated() {
        // 41 = 0b101001 -> bits 0, 3, 5; 7 = 0b111 in the second chunk ->
        // bits 64, 65, 66 with 64-bit chunks.
        let bitmap: DenseBitmap = (96, vec![41, 7]).into();
        let expected_bits = vec![0, 3, 5, 64, 65, 66];

        let bits: Vec<ItemID> = bitmap.into_iter().collect();

        assert_eq!(bits, expected_bits);
    }
}

A bitmap/src/lib.rs => bitmap/src/lib.rs +19 -0
@@ 0,0 1,19 @@
use thiserror::Error;

pub mod dense;
pub mod sparse;
pub mod util;

pub use dense::DenseBitmap;
pub use sparse::SparseBitmap;

/// Identifier of an item tracked by a bitmap; used directly as the bit position.
pub type ItemID = usize;

/// Errors returned when modifying a bitmap.
#[derive(Debug, Error)]
pub enum BitmapError {
    /// The key is not below the bitmap's size. Carries `(key, size)`.
    #[error("out-of-bounds")]
    OutOfBounds(ItemID, usize),

    /// A sparse bitmap was asked to set a bit that lies before the bits it has
    /// already recorded.
    #[error("out-of-order")]
    SettingOutOfOrder,
}

A bitmap/src/sparse.rs => bitmap/src/sparse.rs +142 -0
@@ 0,0 1,142 @@
use serde::{Deserialize, Serialize};

use super::{BitmapError, ItemID};

/// A run of consecutive equal bits in a `SparseBitmap`. The value of the run
/// is implied by its position in the run list, so only the length is stored.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct Run {
    /// Number of bits in this run.
    pub length: u32,
}

/// SparseBitmap stores contiguous runs of the same value, rather than each bit
/// individually. It has a fixed size assigned at creation and cannot be
/// resized.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SparseBitmap {
    /// Alternating runs, starting with a (possibly empty) run of 0s: runs at
    /// even positions are 0s, runs at odd positions are 1s.
    runs: Vec<Run>,
    /// Number of addressable bits; bits not covered by any run read as 0.
    size: usize,
    /// Total number of bits covered by `runs` so far.
    run_length: usize,
}

impl SparseBitmap {
    /// Creates a bitmap with `size` addressable bits, all of which read as 0.
    pub fn of_size(size: usize) -> Self {
        // The run list always begins with a run of 0s; it starts out empty.
        let leading_zeros = Run { length: 0 };

        SparseBitmap {
            runs: vec![leading_zeros],
            run_length: 0,
            size,
        }
    }

    /// Number of addressable bits in this bitmap.
    pub fn size(&self) -> usize {
        self.size
    }

    /// Sets the bit at the specified position to be true. Must be called in
    /// order from the lowest bit to the highest bit (for convenience of
    /// implementation). Setting the most recently set bit again is a no-op.
    ///
    /// # Errors
    ///
    /// Returns [`BitmapError::OutOfBounds`] when `key` is not below the size,
    /// and [`BitmapError::SettingOutOfOrder`] when `key` lies before the most
    /// recently recorded bit.
    pub fn set(&mut self, key: ItemID) -> Result<(), BitmapError> {
        if key >= self.size {
            return Err(BitmapError::OutOfBounds(key, self.size));
        }
        if key + 1 == self.run_length {
            // The most recent bit is being set again; nothing to do.
            return Ok(());
        }
        if key < self.run_length {
            return Err(BitmapError::SettingOutOfOrder);
        }

        // Even-length run lists end on a run of 1s.
        let tail_is_ones = self.runs.len() % 2 == 0;
        let gap = key - self.run_length;

        if gap > 0 {
            // The skipped bits are 0s: open a run of 0s if the tail is 1s,
            // then grow it by the gap.
            if tail_is_ones {
                self.runs.push(Run { length: 0 });
            }
            if let Some(zeros) = self.runs.last_mut() {
                // NOTE(review): narrowing cast; gaps above u32::MAX would be
                // truncated.
                zeros.length += gap as u32;
                self.run_length += gap;
            }
        }

        // If the tail is now a run of 0s, a fresh run of 1s must be opened.
        if gap > 0 || !tail_is_ones {
            self.runs.push(Run { length: 0 });
        }
        if let Some(ones) = self.runs.last_mut() {
            ones.length += 1;
            self.run_length += 1;
        }

        Ok(())
    }

    /// Reads the bit for `key`; `None` when `key` is outside the bitmap.
    pub fn get(&self, key: ItemID) -> Option<bool> {
        if key >= self.size {
            return None;
        }

        let mut covered: usize = 0;
        for (position, run) in self.runs.iter().enumerate() {
            covered += run.length as usize;

            if key < covered {
                // Odd positions hold runs of 1s.
                return Some(position % 2 == 1);
            }
        }

        // Bits past the recorded runs have never been set.
        Some(false)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created bitmap reports its size and reads as all zeros.
    #[test]
    fn sparse_bitmaps_default_to_0s() {
        let size = 100;
        let bitmap = SparseBitmap::of_size(size);

        assert_eq!(bitmap.size(), size, "size should be set");
        for idx in 0..size {
            assert_eq!(bitmap.get(idx), Some(false), "failed at index {idx}");
        }
        // The first key past the end is out of range.
        assert_eq!(bitmap.get(size), None);
    }

    /// Bits set in ascending order read back as set, with gaps left as zeros.
    #[test]
    fn sparse_bitmaps_can_get_and_set() {
        let size = 4;
        let mut bitmap = SparseBitmap::of_size(size);

        bitmap.set(0).expect("should set the value");

        assert_eq!(bitmap.get(0), Some(true));
        assert_eq!(bitmap.get(1), Some(false));
        assert_eq!(bitmap.get(2), Some(false));
        assert_eq!(bitmap.get(3), Some(false));

        // Skips bit 1, then extends the run of ones by one more bit.
        bitmap.set(2).expect("should set the value");
        bitmap.set(3).expect("should set the value");

        assert_eq!(bitmap.get(0), Some(true));
        assert_eq!(bitmap.get(1), Some(false));
        assert_eq!(bitmap.get(2), Some(true));
        assert_eq!(bitmap.get(3), Some(true));
    }

    /// Setting the most recently set bit a second time succeeds and changes
    /// nothing.
    #[test]
    fn can_set_same_bit_twice() {
        let size = 2;
        let mut bitmap = SparseBitmap::of_size(size);

        bitmap.set(0).expect("should set the value");
        bitmap.set(0).expect("should set the value");

        assert_eq!(bitmap.get(0), Some(true));
        assert_eq!(bitmap.get(1), Some(false));
    }
}

A bitmap/src/util.rs => bitmap/src/util.rs +16 -0
@@ 0,0 1,16 @@
use std::ops::{Div, Rem};

/// Divides `a` by `b` and returns `(quotient, remainder)` as a pair. Intended
/// for two positive integers.
///
/// # Examples
///
/// ```
///     assert_eq!((1, 2), bitmap::util::div_with_rem(7, 5));
///     assert_eq!((2, 0), bitmap::util::div_with_rem(8, 4));
/// ```
pub fn div_with_rem<T>(a: T, b: T) -> (T, T)
where
    T: Copy + Div<Output = T> + Rem<Output = T>,
{
    let quotient = a / b;
    let remainder = a % b;

    (quotient, remainder)
}

A docs/0002-position-index.md => docs/0002-position-index.md +100 -0
@@ 0,0 1,100 @@

This design document is a working document for the initial position index in
IsabellaDB. It reflects a point in time and was used for design purposes, but
will not be updated. Future design documents may supplant this one.

---

# Interface

The purpose of the position index is to allow us to ask the question: for this
position, in which other games has it occurred?

This will be used directly by the DB while executing queries. It won't have a
direct public-facing API.

The internal API will resemble:
```rust
struct PositionIndex;
impl PositionIndex {
    fn games_containing(position: &Chess) -> IndexIterator<GameID> {
        todo!()
    }
}
```
We can use `Chess` (a position) as the input since it satisfies the traits we
need to implement this: it is `Hash` and `Eq`. We need both, because we may get
collisions from hashes, so we need to detect and avoid those.

We return an iterator so that the game IDs don't all have to be materialized
into a list before taking action on them. This is critical for two reasons:
- We may want to return partial results, so materializing everything is a waste
- The query system will compose iterators to compute final results


# Internals

The position index is an inverted index: It maps from positions to the games
containing those positions. For very sparse positions, this can be implemented
almost trivially using a hashmap from the position to a list of games which
contain that position. However, the early tree is going to be rather dense:
there are only a handful of popular first moves, so some will have millions of
games.

We can address this using a bitmap index. Storing the entire bitmap has
significant space savings over storing a list of IDs, but we can do even better
using a sparse bitmap. This can be encoded using run-length encoding [1]. For
bitmaps with large runs of the same value, this allows significant compression,
and bitmap operations can be performed while remaining in compressed form.

The inverted index thus will look something like this:
```rust
type PositionHash = u64;
struct PositionIndex {
    map: HashMap<PositionHash, SparseBitmap<GameID>>,
}
```
Retrieval from the inverted index should be quite straightforward, and if you
want to combine bitmaps to see where two positions have occurred, that can also
be done by intersecting bitmaps.

The bitmap indexes themselves are where the fun and complexity will lie in
this implementation. There are a few bitmap encodings we can use:
- Dense: one bit is stored for each record indexed
- Run-length encoding: each run of consecutive equal values is stored as a
    tuple of (number, length); for a bitmap, this can reduce to just (length)
    as long as we fix the first value to be a 0 (and possibly have length 0),
    since a new run will only start when the value flips.

More complex encodings are possible and will be explored based on performance.

The bitmap interface will look something like:
```rust
trait Bitmap<Key> {
    /// bitmaps are a fixed size for now
    fn of_size(size: usize) -> Self;

    fn or(&self, other: Box<dyn Bitmap>) -> Self;
    fn and(&self, other: Box<dyn Bitmap>) -> Self;
    fn not(&self) -> Self;

    fn iter() -> Iterator<Key>;
}
```

Then we can have implementations like `SparseBitmap` and `DenseBitmap`, and
use them together.


# Metrics/Traces/Logs

We should focus on observing the following:

- Number of position hash collisions
- Total number of positions
- Distribution of number of positions per game
- Distribution of run lengths in Bitmaps
- Sizes of Bitmaps under different encodings


[1]: https://en.wikipedia.org/wiki/Run-length_encoding

M isabella/Cargo.toml => isabella/Cargo.toml +3 -0
@@ 9,9 9,12 @@ license = "AGPL-3.0-or-later"
[dependencies]
clap = { version = "4.0.18", features = ["derive"] }

bitmap = { path = "../bitmap" }
pgn = { path = "../pgn" }
smartstring = "1.0.1"

rand = "0.8.5"

rmp-serde = "1.1.1"
bincode = "1.3.3"


M isabella/src/bin/idb.rs => isabella/src/bin/idb.rs +23 -19
@@ 3,6 3,7 @@ use std::io::{Read, Write};

use clap::{Parser, Subcommand};
use isabella_db::db::GameDB;
use isabella_db::index::PositionIndex;
use pgn::load::PgnFile;
use tracing_subscriber::fmt::format::FmtSpan;
use tracing_subscriber::EnvFilter;


@@ 10,7 11,7 @@ use tracing_subscriber::EnvFilter;
#[derive(Parser, Debug)]
struct Args {
    #[arg(short, long)]
    filename: String,
    gamedb_filename: String,

    #[command(subcommand)]
    command: Commands,


@@ 18,10 19,8 @@ struct Args {

#[derive(Subcommand, Debug)]
enum Commands {
    Validate,
    FromPgn,
    Convert { outfilename: String },
    Load,
    Convert { pgn_filename: String },
    Index { index_filename: String },
}

fn main() {


@@ 33,17 32,8 @@ fn main() {
    let args = Args::parse();

    match args.command {
        Commands::Validate => {
            let file = PgnFile::new(args.filename).expect("should open the file");
            println!("validated {} games", file.count());
        }
        Commands::FromPgn => {
            let file = PgnFile::new(args.filename).expect("should open the file");
            let db = GameDB::from_pgn(file);
            println!("loaded {} games", db.len());
        }
        Commands::Convert { outfilename } => {
            let file = PgnFile::new(args.filename).expect("should open the file");
        Commands::Convert { pgn_filename } => {
            let file = PgnFile::new(pgn_filename).expect("should open the file");
            let db = GameDB::from_pgn(file);

            let buf = bincode::serialize(&db).expect("serializing should work");


@@ 51,17 41,31 @@ fn main() {
            let mut outfile: File = File::options()
                .write(true)
                .create(true)
                .open(outfilename)
                .open(args.gamedb_filename)
                .expect("should open file to write");
            outfile.write_all(&buf).expect("should write to file");
        }
        Commands::Load => {
            let mut file: File = File::open(args.filename).expect("should open the file");
        Commands::Index { index_filename } => {
            let mut file: File = File::open(args.gamedb_filename).expect("should open the file");
            let mut buf: Vec<u8> = Vec::new();
            file.read_to_end(&mut buf).expect("should read");

            let db: GameDB = bincode::deserialize(&buf).expect("deserializing should work");
            println!("loaded {} games", db.len());
            drop(buf);
            drop(file);
            println!("dropped buf and file");

            let index = PositionIndex::load(&db);

            let buf = bincode::serialize(&index).expect("serializing should work");

            let mut outfile: File = File::options()
                .write(true)
                .create(true)
                .open(index_filename)
                .expect("should open file to write");
            outfile.write_all(&buf).expect("should write to file");
        }
    };
}

M isabella/src/db/mod.rs => isabella/src/db/mod.rs +4 -0
@@ 21,6 21,10 @@ impl GameDB {
        }
    }

    /// Returns the games held by this database as a borrowed slice.
    pub fn games(&self) -> &[Game] {
        &self.games
    }

    /// Returns the number of games held by this database.
    pub fn len(&self) -> usize {
        self.games.len()
    }

A isabella/src/index/mod.rs => isabella/src/index/mod.rs +4 -0
@@ 0,0 1,4 @@
pub mod position;
pub mod unique_fixed;

pub use position::PositionIndex;

A isabella/src/index/position.rs => isabella/src/index/position.rs +83 -0
@@ 0,0 1,83 @@
use std::collections::HashMap;

use bitmap::SparseBitmap;
use shakmaty::{zobrist::ZobristHash, Chess, Position};

use super::unique_fixed::UniqueFixedIndex;
use crate::db::GameDB;
use crate::game::StartingPosition;

/// Rough per-game move estimate, used only to pre-size the position map when
/// building the index.
const AVG_MOVES_PER_GAME: usize = 20;

/// Positions are keyed by a 64-bit Zobrist hash. A 64-bit hash should suffice
/// in most situations (up to 4B positions before a collision is expected).
///
/// NOTE(review): this comment previously said a 128-bit hash is used for extra
/// collision resistance, but the alias is `u64`; confirm which width is
/// intended.
pub type BoardHash = u64;

/// PositionIndex is a UniqueFixedIndex which contains Zobrist hashes of
/// positions and sparse bitmaps of the games containing these positions.
pub type PositionIndex = UniqueFixedIndex<BoardHash, SparseBitmap>;

impl PositionIndex {
    pub fn load(db: &GameDB) -> PositionIndex {
        let init_size_guess = db.games().len() * AVG_MOVES_PER_GAME / 2;
        tracing::info!(init_size_guess, "initializing positions");

        let mut positions: HashMap<BoardHash, SparseBitmap> =
            HashMap::with_capacity(init_size_guess);

        let num_games = db.games().len();

        for (idx, game) in db.games().iter().enumerate() {
            let mut pos = match &game.starting_position {
                StartingPosition::Standard => Chess::default(),
                StartingPosition::Custom(pos) => pos.clone(),
            };

            let zhash = pos.zobrist_hash();
            let bmap = positions
                .entry(zhash)
                .or_insert_with(|| SparseBitmap::of_size(num_games));
            bmap.set(idx)
                .expect("position {idx} should be able to be set");

            for m in game.moves.iter() {
                pos.play_unchecked(m);

                let zhash = pos.zobrist_hash();
                let bmap = positions
                    .entry(zhash)
                    .or_insert_with(|| SparseBitmap::of_size(num_games));
                bmap.set(idx)
                    .expect("position {idx} should be able to be set");
            }

            if idx % 100_000 == 0 {
                tracing::debug!(idx, positions = positions.len(), "loading in progress");
            }
        }

        tracing::debug!(len = positions.len(), "built positions hashmap");

        let mut keys: Vec<BoardHash> = Vec::with_capacity(positions.len());
        let mut vals: Vec<SparseBitmap> = Vec::with_capacity(positions.len());

        let mut kvs: Vec<(BoardHash, SparseBitmap)> = positions.drain().collect();
        tracing::debug!("built kvs");
        drop(positions);
        tracing::debug!("dropped positions");

        kvs.sort_by_key(|t| t.0);
        tracing::debug!("sorted kvs");
        for (key, val) in kvs.into_iter() {
            keys.push(key);
            vals.push(val);
        }
        tracing::debug!("moved to keys/vals");

        UniqueFixedIndex::from_dedup_vecs(keys, vals)
    }
}

#[cfg(test)]
mod tests {}

A isabella/src/index/unique_fixed.rs => isabella/src/index/unique_fixed.rs +79 -0
@@ 0,0 1,79 @@
use serde::{Deserialize, Serialize};

/// UniqueFixedIndex is a fixed-size container allowing locating a given value
/// by key, or the presence/absence of a key.
///
/// It must be initialized from the full dataset at initialization time and
/// cannot be inserted into later.
///
/// keys and values are stored separately so that we can have better cache hits
/// while searching for key matches before indexing into the vals vec to
/// retrieve the searched-for value.
#[derive(Serialize, Deserialize)]
pub struct UniqueFixedIndex<K, V> {
    /// Keys, looked up with binary search.
    keys: Vec<K>,
    /// Values; `vals[i]` belongs to `keys[i]`.
    vals: Vec<V>,
}

impl<K, V> UniqueFixedIndex<K, V>
where
    K: Ord,
{
    /// Create a UniqueFixedIndex from key and value Vecs. This takes ownership
    /// of the provided data so it does not require any additional memory
    /// allocation during the construction of the UFI.
    ///
    /// `keys` MUST be sorted and deduplicated, i.e. strictly ascending;
    /// lookups use binary search and would silently miss entries otherwise.
    ///
    /// # Panics
    ///
    /// Panics if keys and vals are not the same length, or if `keys` is not
    /// strictly ascending.
    pub fn from_dedup_vecs(mut keys: Vec<K>, mut vals: Vec<V>) -> Self {
        assert!(
            keys.len() == vals.len(),
            "keys and values should be the same size"
        );
        // One linear pass; cheap next to the sort callers already perform.
        assert!(
            keys.windows(2).all(|pair| pair[0] < pair[1]),
            "keys should be sorted and deduplicated"
        );

        // ensure the vecs are as small as possible since we will never insert
        // into them.
        keys.shrink_to_fit();
        vals.shrink_to_fit();

        UniqueFixedIndex { keys, vals }
    }

    /// Retrieve the value which matches the given key, if any.
    pub fn get(&self, key: &K) -> Option<&V> {
        let index = self.keys.binary_search(key).ok()?;
        self.vals.get(index)
    }

    /// Checks whether the key is contained in the UFI or not.
    pub fn contains(&self, key: &K) -> bool {
        self.keys.binary_search(key).is_ok()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Construction keeps all entries, trims spare capacity, and lookups
    /// distinguish present keys from absent ones.
    #[test]
    fn ufi_can_find_elems() {
        let keys = vec![1, 2, 3, 4, 5];
        let vals = vec![10, 20, 30, 40, 50];

        let ufi = UniqueFixedIndex::from_dedup_vecs(keys, vals);

        assert_eq!(ufi.keys.len(), 5);
        assert_eq!(ufi.vals.len(), 5);
        // Capacity equals length: the vecs hold no spare room.
        assert_eq!(ufi.keys.capacity(), 5);
        assert_eq!(ufi.vals.capacity(), 5);

        assert!(ufi.contains(&1));
        // 10 is a value, not a key.
        assert!(!ufi.contains(&10));

        assert_eq!(ufi.get(&4), Some(&40));
        assert_eq!(ufi.get(&40), None);
    }
}

M isabella/src/lib.rs => isabella/src/lib.rs +1 -0
@@ 1,3 1,4 @@
pub mod db;
pub mod game;
pub mod index;
pub mod strings;