~ntietz/isabella-db

c12bccb222ab4356ac1acc8c3a32ff146a7c68fd — Nicole Tietz-Sokolskaya 1 year, 3 months ago 25b04e0
Switch to new PGN parser and implement to-disk serialization.

This makes it so that we can read in the full Caissabase file without
any errors. We can also save the database back out in MessagePack format.
That format is pretty slow to save and even slower to read, so we will
want to do something different; I think manually packing a custom binary
format is in order.
18 files changed, 637 insertions(+), 633 deletions(-)

M Cargo.lock
M Cargo.toml
M isabella/Cargo.toml
M isabella/src/bin/idb.rs
A isabella/src/db/mod.rs
A isabella/src/game/mod.rs
A isabella/src/game/serde_proxy.rs
M isabella/src/lib.rs
A isabella/src/strings.rs
M pgn/Cargo.toml
M pgn/src/lib.rs
A pgn/src/load/mod.rs
D pgn/src/parse/file.rs
D pgn/src/parse/mod.rs
D pgn/src/parse/moves.rs
D pgn/src/parse/san.rs
D pgn/src/parse/tag.rs
D pgn/src/parse/util.rs
M Cargo.lock => Cargo.lock +165 -0
@@ 3,6 3,12 @@
version = 3

[[package]]
name = "arrayvec"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"

[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 26,6 32,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

[[package]]
name = "btoi"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97c0869a9faa81f8bbf8102371105d6d0a7b79167a04c340b04ab16892246a11"
dependencies = [
 "num-traits",
]

[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 89,6 110,10 @@ version = "0.1.0"
dependencies = [
 "clap",
 "pgn",
 "rmp-serde",
 "serde",
 "shakmaty",
 "smartstring",
 "tracing",
 "tracing-subscriber",
]


@@ 115,6 140,24 @@ dependencies = [
]

[[package]]
name = "mach"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa"
dependencies = [
 "libc",
]

[[package]]
name = "matchers"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
dependencies = [
 "regex-automata",
]

[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 147,6 190,15 @@ dependencies = [
]

[[package]]
name = "num-traits"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
 "autocfg",
]

[[package]]
name = "once_cell"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 165,16 217,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"

[[package]]
name = "paste"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1de2e551fb905ac83f73f7aedf2f0cb4a0da7e35efa24a202a936269f1f18e1"

[[package]]
name = "pgn"
version = "0.1.0"
dependencies = [
 "nom",
 "pgn-reader",
 "serde",
 "shakmaty",
 "smartstring",
 "thiserror",
 "tracing",
]

[[package]]
name = "pgn-reader"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bda82b38f84e44927cfefa8588cd1e08cef46430aa3b9a0d4f863fefa8d44dc4"
dependencies = [
 "btoi",
 "memchr",
 "shakmaty",
 "slice-deque",
]

[[package]]
name = "pin-project-lite"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 223,6 296,83 @@ dependencies = [
]

[[package]]
name = "regex"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
dependencies = [
 "regex-syntax",
]

[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
dependencies = [
 "regex-syntax",
]

[[package]]
name = "regex-syntax"
version = "0.6.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"

[[package]]
name = "rmp"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44519172358fd6d58656c86ab8e7fbc9e1490c3e8f14d35ed78ca0dd07403c9f"
dependencies = [
 "byteorder",
 "num-traits",
 "paste",
]

[[package]]
name = "rmp-serde"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5b13be192e0220b8afb7222aa5813cb62cc269ebb5cac346ca6487681d2913e"
dependencies = [
 "byteorder",
 "rmp",
 "serde",
]

[[package]]
name = "serde"
version = "1.0.147"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965"
dependencies = [
 "serde_derive",
]

[[package]]
name = "serde_derive"
version = "1.0.147"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "shakmaty"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c21d64aa05e370f386703bd46acdfd13d04f202fb4ce365cc67bbf744a72003"
dependencies = [
 "arrayvec",
 "bitflags",
 "btoi",
]

[[package]]
name = "sharded-slab"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 232,6 382,17 @@ dependencies = [
]

[[package]]
name = "slice-deque"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31ef6ee280cdefba6d2d0b4b78a84a1c1a3f3a4cec98c2d4231c8bc225de0f25"
dependencies = [
 "libc",
 "mach",
 "winapi",
]

[[package]]
name = "smallvec"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"


@@ 359,10 520,14 @@ version = "0.3.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70"
dependencies = [
 "matchers",
 "nu-ansi-term",
 "once_cell",
 "regex",
 "sharded-slab",
 "smallvec",
 "thread_local",
 "tracing",
 "tracing-core",
 "tracing-log",
]

M Cargo.toml => Cargo.toml +3 -1
@@ 4,9 4,11 @@ members = ["isabella", "pgn"]

[workspace.dependencies]

serde = { version = "1.0", features = ["derive"] }
shakmaty = "0.22.0"
thiserror = "1.0.37"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["fmt", "std"] }
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "std"] }

[profile.release]
debug = true

M isabella/Cargo.toml => isabella/Cargo.toml +5 -0
@@ 10,6 10,11 @@ license = "AGPL-3.0-or-later"
clap = { version = "4.0.18", features = ["derive"] }

pgn = { path = "../pgn" }
smartstring = "1.0.1"

rmp-serde = "1.1.1"

serde = { workspace = true }
shakmaty = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }

M isabella/src/bin/idb.rs => isabella/src/bin/idb.rs +41 -28
@@ 1,5 1,12 @@
use std::fs::File;
use std::io::{BufReader, BufWriter};

use clap::{Parser, Subcommand};
use pgn::{load_file, validate_file};
use isabella_db::db::GameDB;
use pgn::load::PgnFile;
use serde::Serialize;
use tracing_subscriber::fmt::format::FmtSpan;
use tracing_subscriber::EnvFilter;

#[derive(Parser, Debug)]
struct Args {


@@ 13,43 20,49 @@ struct Args {
// Subcommands understood by the `idb` binary. Plain `//` comments are used
// on purpose: clap turns `///` doc comments into user-visible help text.
#[derive(Subcommand, Debug)]
enum Commands {
    // Iterate over the PGN file and print how many games it yields.
    Validate,
    // Build an in-memory `GameDB` from the PGN file and print its size.
    FromPgn,
    // Build a `GameDB` from the PGN file and serialize it to `outfilename`
    // with `rmp_serde`.
    Convert { outfilename: String },
    // Read a `GameDB` back with `rmp_serde` and print its size.
    Load,
}

/// Entry point of the `idb` command-line tool.
///
/// Installs a `tracing` subscriber whose filter comes from the environment,
/// parses the command-line arguments, and runs the selected subcommand.
/// Every failure is reported by panicking with a short message.
fn main() {
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env())
        .with_span_events(FmtSpan::ACTIVE)
        .init();

    let args = Args::parse();

    match args.command {
        Commands::Validate => {
            let result = validate_file(args.filename).expect("should load the file");

            for (buf, err) in &result.failures {
                println!("{buf}");
                println!("----------");
                println!("{err}");
                println!();
            }

            let failure_count = result.failures.len();
            let success_count = result.successful;
            println!(
                "{} successes / {} failures ({:.2}%)",
                success_count,
                failure_count,
                (failure_count as f32 / (failure_count + success_count) as f32 * 100.0)
            );
            let file = PgnFile::new(args.filename).expect("should open the file");
            println!("validated {} games", file.count());
        }
        Commands::FromPgn => {
            let file = PgnFile::new(args.filename).expect("should open the file");
            let db = GameDB::from_pgn(file);
            println!("loaded {} games", db.len());
        }
        Commands::Convert { outfilename } => {
            let file = PgnFile::new(args.filename).expect("should open the file");
            let db = GameDB::from_pgn(file);

            // truncate so that overwriting a longer, pre-existing file does
            // not leave stale bytes after the newly written database
            let outfile: File = File::options()
                .write(true)
                .create(true)
                .truncate(true)
                .open(outfilename)
                .expect("should open file to write");
            let mut writer = BufWriter::new(outfile);
            let mut serializer = rmp_serde::Serializer::new(&mut writer);

            db.serialize(&mut serializer).expect("should serialize");

            // flush explicitly: a BufWriter that is merely dropped discards
            // any error raised while writing out its remaining buffer
            std::io::Write::flush(&mut writer).expect("should flush the output file");
        }
        Commands::Load => {
            let result = load_file(args.filename).expect("should load the file");

            let failure_count = result.failures;
            let success_count = result.games.len();
            println!(
                "{} successes / {} failures ({:.2}%)",
                success_count,
                failure_count,
                (failure_count as f32 / (failure_count + success_count) as f32 * 100.0)
            );
            let file: File = File::open(args.filename).expect("should open the file");
            let mut reader = BufReader::new(file);

            let db: GameDB = rmp_serde::from_read(&mut reader).expect("should read the file back");
            println!("loaded {} games", db.len());
        }
    };
}

A isabella/src/db/mod.rs => isabella/src/db/mod.rs +61 -0
@@ 0,0 1,61 @@
use pgn::PgnFile;
use serde::{Deserialize, Serialize};
use shakmaty::Chess;

use crate::game::{Game, StartingPosition};
use crate::strings::StringsTable;

const TRACE_CHUNK_SIZE: usize = 500_000;

/// An in-memory collection of games together with the table of strings that
/// the games' tags refer to by id.
#[derive(Default, Serialize, Deserialize)]
pub struct GameDB {
    /// All stored games; `by_id` indexes into this vector.
    games: Vec<Game>,
    /// Deduplicated tag keys and values, referenced from `Game::tags` by id.
    strings: StringsTable,
}

impl GameDB {
    pub fn new() -> Self {
        GameDB {
            games: vec![],
            strings: StringsTable::new(),
        }
    }

    pub fn len(&self) -> usize {
        self.games.len()
    }

    pub fn is_empty(&self) -> bool {
        self.games.is_empty()
    }

    pub fn by_id(&self, id: usize) -> Option<&Game> {
        self.games.get(id)
    }

    pub fn from_pgn(f: PgnFile) -> Self {
        let mut db = GameDB::new();

        let standard_start = Chess::default();

        for record in f {
            let mut game = Game::default();
            if record.starting_position != standard_start {
                game.starting_position = StartingPosition::Custom(record.starting_position);
            }
            game.moves = record.moves;
            for (key, value) in record.tags {
                let kid = db.strings.insert(key);
                let vid = db.strings.insert(value);
                game.tags.push((kid, vid));
            }
            db.games.push(game);

            if db.games.len() % TRACE_CHUNK_SIZE == 0 {
                tracing::info!(count = db.games.len(), "loading games");
            }
        }

        db
    }
}

A isabella/src/game/mod.rs => isabella/src/game/mod.rs +23 -0
@@ 0,0 1,23 @@
use serde::{Deserialize, Serialize};
use shakmaty::{Chess, Move};

use crate::strings::StringID;

mod serde_proxy;

use serde_proxy::{chess_serde, vec_move_def};

/// The position a game starts from.
#[derive(Debug, Default, Serialize, Deserialize)]
pub enum StartingPosition {
    /// The default variant; no position is stored for it.
    #[default]
    Standard,
    /// An explicit position, (de)serialized through `chess_serde`.
    Custom(#[serde(with = "chess_serde")] Chess),
}

/// A single stored game: where it starts, the moves played, and its tags.
#[derive(Default, Serialize, Deserialize)]
pub struct Game {
    /// The position the moves are played from.
    pub starting_position: StartingPosition,
    /// The moves in order, (de)serialized through `vec_move_def`; an absent
    /// field deserializes to an empty list because of `default`.
    #[serde(default, with = "vec_move_def")]
    pub moves: Vec<Move>,
    /// Pairs of string ids, one pair per tag.
    pub tags: Vec<(StringID, StringID)>,
}

A isabella/src/game/serde_proxy.rs => isabella/src/game/serde_proxy.rs +166 -0
@@ 0,0 1,166 @@
use serde::{Deserialize, Serialize};
use shakmaty::{Move, Role, Square};

/// Serde "remote" definition for `shakmaty::Role`, used via
/// `#[serde(with = "RoleDef")]` wherever a `Role` needs (de)serializing.
// NOTE(review): the explicit discriminants presumably mirror shakmaty's own
// values; confirm against the shakmaty version in use.
#[derive(Serialize, Deserialize)]
#[serde(remote = "shakmaty::Role")]
pub enum RoleDef {
    Pawn = 1,
    Knight = 2,
    Bishop = 3,
    Rook = 4,
    Queen = 5,
    King = 6,
}

/// `#[serde(with = "opt_role_def")]` support for `Option<Role>` fields.
pub mod opt_role_def {
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    use super::{Role, RoleDef};

    /// Serializes the optional role by wrapping a present value in a newtype
    /// that routes through `RoleDef`.
    pub fn serialize<S>(value: &Option<Role>, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        #[derive(Serialize)]
        struct Helper<'a>(#[serde(with = "RoleDef")] &'a Role);

        let wrapped: Option<Helper<'_>> = value.as_ref().map(Helper);
        Serialize::serialize(&wrapped, serializer)
    }

    /// Deserializes an optional role, unwrapping the `RoleDef`-backed newtype
    /// when a value is present.
    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<Role>, D::Error>
    where
        D: Deserializer<'de>,
    {
        #[derive(Deserialize)]
        struct Helper(#[serde(with = "RoleDef")] Role);

        Option::<Helper>::deserialize(deserializer).map(|maybe| maybe.map(|Helper(role)| role))
    }
}

/// Serde "remote" definition for `shakmaty::Square`, used via
/// `#[serde(with = "SquareDef")]`. Squares are listed rank by rank, from A1
/// (discriminant 0) through H8; `rustfmt::skip` keeps one rank per line.
#[rustfmt::skip]
#[derive(Serialize, Deserialize)]
#[serde(remote = "shakmaty::Square")]
#[repr(u8)]
pub enum SquareDef {
    A1 = 0, B1, C1, D1, E1, F1, G1, H1,
    A2, B2, C2, D2, E2, F2, G2, H2,
    A3, B3, C3, D3, E3, F3, G3, H3,
    A4, B4, C4, D4, E4, F4, G4, H4,
    A5, B5, C5, D5, E5, F5, G5, H5,
    A6, B6, C6, D6, E6, F6, G6, H6,
    A7, B7, C7, D7, E7, F7, G7, H7,
    A8, B8, C8, D8, E8, F8, G8, H8,
}

/// Serde "remote" definition for `shakmaty::Move`. Each field is routed
/// through the matching proxy (`RoleDef`, `SquareDef`, `opt_role_def`).
#[derive(Serialize, Deserialize)]
#[serde(remote = "shakmaty::Move")]
enum MoveDef {
    /// A move of `role` between two squares, with optional capture and
    /// promotion roles.
    Normal {
        #[serde(with = "RoleDef")]
        role: Role,
        #[serde(with = "SquareDef")]
        from: Square,
        // `default` lets an absent field deserialize as `None`
        #[serde(default, with = "opt_role_def")]
        capture: Option<Role>,
        #[serde(with = "SquareDef")]
        to: Square,
        // `default` lets an absent field deserialize as `None`
        #[serde(default, with = "opt_role_def")]
        promotion: Option<Role>,
    },
    /// An en passant capture, described by its two squares.
    EnPassant {
        #[serde(with = "SquareDef")]
        from: Square,
        #[serde(with = "SquareDef")]
        to: Square,
    },
    /// Castling, described by the squares of the king and the rook.
    Castle {
        #[serde(with = "SquareDef")]
        king: Square,
        #[serde(with = "SquareDef")]
        rook: Square,
    },
    /// Placing a piece of `role` on `to`.
    Put {
        #[serde(with = "RoleDef")]
        role: Role,
        #[serde(with = "SquareDef")]
        to: Square,
    },
}

/// `#[serde(with = "vec_move_def")]` support for `Vec<Move>` fields.
pub mod vec_move_def {
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    use super::{Move, MoveDef};

    /// Serializes the moves as a sequence, routing each one through
    /// `MoveDef`.
    ///
    /// The elements are streamed straight into the serializer instead of
    /// first being collected into a temporary vector.
    pub fn serialize<S>(value: &[Move], serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        #[derive(Serialize)]
        struct Helper<'a>(#[serde(with = "MoveDef")] &'a Move);

        serializer.collect_seq(value.iter().map(Helper))
    }

    /// Deserializes a sequence of moves, routing each one through `MoveDef`.
    pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Move>, D::Error>
    where
        D: Deserializer<'de>,
    {
        #[derive(Deserialize)]
        struct Helper(#[serde(with = "MoveDef")] Move);

        let helpers = Vec::<Helper>::deserialize(deserializer)?;
        // move each `Move` out of its wrapper rather than cloning it
        Ok(helpers.into_iter().map(|Helper(m)| m).collect())
    }
}

/// `#[serde(with = "chess_serde")]` support for `Chess` positions, which are
/// written and read as FEN strings.
pub mod chess_serde {
    use serde::{de::Visitor, Deserializer, Serializer};
    use shakmaty::{fen::Fen, CastlingMode, Chess, EnPassantMode};

    /// Serializes the position as its FEN text.
    pub fn serialize<S>(value: &Chess, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // `Fen::from_position` takes the position by value, hence the clone
        let text = Fen::from_position(value.clone(), EnPassantMode::Always).to_string();
        serializer.serialize_str(&text)
    }

    /// Visitor that turns a FEN string into a `Chess` position.
    struct FenVisitor;

    impl<'de> Visitor<'de> for FenVisitor {
        type Value = Chess;

        fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
            formatter.write_str("a valid FEN string")
        }

        /// Parses the FEN and sets the position up in standard castling
        /// mode; either failure is reported via its `Debug` rendering.
        fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
        where
            E: serde::de::Error,
        {
            let fen = value
                .parse::<Fen>()
                .map_err(|e| E::custom(format!("{e:?}")))?;

            fen.into_position(CastlingMode::Standard)
                .map_err(|e| E::custom(format!("{e:?}")))
        }
    }

    /// Deserializes a position from a FEN string.
    pub fn deserialize<'de, D>(deserializer: D) -> Result<Chess, D::Error>
    where
        D: Deserializer<'de>,
    {
        deserializer.deserialize_str(FenVisitor)
    }
}

M isabella/src/lib.rs => isabella/src/lib.rs +3 -14
@@ 1,14 1,3 @@
/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
    left + right
}

#[cfg(test)]
mod tests {
    use super::*;

    // Checks that `add` sums its two arguments.
    #[test]
    fn it_works() {
        let result = add(2, 2);
        assert_eq!(result, 4);
    }
}
pub mod db;
pub mod game;
pub mod strings;

A isabella/src/strings.rs => isabella/src/strings.rs +85 -0
@@ 0,0 1,85 @@
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

use serde::{Deserialize, Serialize};

/// to save memory, we use a 32-bit int. this should be fine as long as we don't
/// store more than 4_294_967_295 strings.
pub type StringID = u32;

pub type StringHash = u64;

/// a table of deduplicated strings, each addressable by a compact id
#[derive(Default, Deserialize, Serialize)]
pub struct StringsTable {
    /// the stored strings; a string's id is its index in this vector
    strings: Vec<String>,
    /// maps the hash of each stored string to its id
    ids: HashMap<StringHash, StringID>,
}

impl StringsTable {
    /// create a new StringsTable with no default capacity
    pub fn new() -> Self {
        StringsTable {
            strings: vec![],
            ids: HashMap::new(),
        }
    }

    /// retrieves the string stored with an id, if it exists
    pub fn by_id(&self, id: StringID) -> Option<&String> {
        self.strings.get(id as usize)
    }

    /// retrieves the string by its hash, if it exists
    pub fn by_hash(&self, hash: StringHash) -> Option<&String> {
        if let Some(id) = self.ids.get(&hash) {
            self.by_id(*id)
        } else {
            None
        }
    }

    /// inserts the string into the table if it doesn't exist, and returns the
    /// id of the string
    pub fn insert<S: AsRef<str>>(&mut self, s: S) -> StringID {
        let hash = generate_hash(&s);
        let next_id = self.strings.len() as u32;

        let id = self.ids.entry(hash).or_insert(next_id);

        if *id == next_id {
            self.strings.push(s.as_ref().into());
        }

        *id
    }

    /// returns true if the offered string is contained in the map
    pub fn contains<S: AsRef<str>>(&self, s: S) -> bool {
        let hash = generate_hash(&s);
        self.ids.contains_key(&hash)
    }
}

/// hashes the string contents with the standard library's `DefaultHasher`
// NOTE(review): std does not specify `DefaultHasher`'s algorithm and allows
// it to change between releases, while these hashes are keys of the
// serializable `StringsTable`; confirm that persisting them is acceptable.
fn generate_hash<S: AsRef<str>>(s: S) -> StringHash {
    let text: &str = s.as_ref();
    let mut hasher = DefaultHasher::new();
    Hash::hash(text, &mut hasher);
    hasher.finish()
}

#[cfg(test)]
mod tests {
    use super::*;

    // inserting the same string twice must hand back the same id, and
    // `contains` must reflect the insertion
    #[test]
    fn strings_are_inserted() {
        let s = "my string";
        let mut table = StringsTable::new();

        assert!(!table.contains(s), "should not contain string");
        assert_eq!(table.insert(s), 0, "first should insert");
        assert!(table.contains(s), "should contain string");
        assert_eq!(table.insert(s), 0, "second should dedup");
        assert!(table.contains(s), "should still contain string");
    }
}

M pgn/Cargo.toml => pgn/Cargo.toml +4 -0
@@ 12,5 12,9 @@ license = "MPL-2.0"
nom = "7.1.1"
smartstring = "1.0.1"

pgn-reader = "0.21.0"

serde = { workspace = true }
shakmaty = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }

M pgn/src/lib.rs => pgn/src/lib.rs +2 -6
@@ 1,9 1,5 @@
#![feature(let_chains)]
#![feature(is_some_and)]
pub mod parse;
pub mod load;

pub use parse::file::{load_file, validate_file};
pub use parse::parse_pgn;
pub use parse::san::SANString;
pub use parse::GameRecord;
pub use parse::ParseError;
pub use load::{PgnFile, PgnRecord};

A pgn/src/load/mod.rs => pgn/src/load/mod.rs +79 -0
@@ 0,0 1,79 @@
use std::{fs::File, path::Path};

use pgn_reader::{BufferedReader, RawHeader, SanPlus, Skip, Visitor};
use shakmaty::{fen::Fen, CastlingMode, Chess, Move, Position};

/// One game as read from a PGN file.
#[derive(Debug, Default)]
pub struct PgnRecord {
    /// The position the game starts from: the default position unless a
    /// `FEN` header could be parsed into a valid position.
    pub starting_position: Chess,
    /// The position after every accepted move has been played.
    pub ending_position: Chess,
    /// The accepted moves, in the order they were read.
    pub moves: Vec<Move>,
    /// The header tags as `(key, value)` pairs, in the order they were read.
    pub tags: Vec<(String, String)>,
}

/// `pgn_reader` visitor that accumulates the game currently being read.
#[derive(Debug, Default)]
struct Loader {
    /// The record under construction; handed out and reset in `end_game`.
    game_record: PgnRecord,
}

impl Visitor for Loader {
    type Result = PgnRecord;

    /// Records a header tag.
    ///
    /// A `FEN` tag that parses into a valid standard-castling position also
    /// replaces the starting (and current) position; an unparsable one is
    /// ignored. The tag itself is stored either way, with PGN escape
    /// sequences in the value decoded and invalid UTF-8 replaced lossily.
    fn header(&mut self, key: &[u8], value: RawHeader<'_>) {
        if key == b"FEN" {
            let pos: Option<Chess> = Fen::from_ascii(value.as_bytes())
                .ok()
                .and_then(|f| f.into_position(CastlingMode::Standard).ok());

            if let Some(pos) = pos {
                self.game_record.starting_position = pos.clone();
                self.game_record.ending_position = pos;
            }
        }

        self.game_record.tags.push((
            String::from_utf8_lossy(key).into_owned(),
            // decode `\"` and `\\` rather than storing the raw, still-escaped bytes
            value.decode_utf8_lossy().into_owned(),
        ));
    }

    /// Skips every variation; only the main line is recorded.
    fn begin_variation(&mut self) -> Skip {
        Skip(true)
    }

    /// Plays a move of the main line.
    ///
    /// A SAN that does not resolve to a move in the current position is
    /// dropped, and later moves are still tried against the unchanged
    /// position.
    fn san(&mut self, san_plus: SanPlus) {
        if let Ok(m) = san_plus.san.to_move(&self.game_record.ending_position) {
            // the move is already validated by parsing from SAN
            self.game_record.ending_position.play_unchecked(&m);
            self.game_record.moves.push(m);
        }
    }

    /// Hands out the finished record and leaves a fresh default record
    /// behind for the next game.
    fn end_game(&mut self) -> Self::Result {
        std::mem::take(&mut self.game_record)
    }
}

/// An iterator over the games of a PGN file on disk.
pub struct PgnFile {
    /// Visitor that builds each `PgnRecord`; reused for every game.
    visitor: Loader,
    /// `pgn_reader` reader over the opened file.
    reader: BufferedReader<File>,
}

impl PgnFile {
    /// Opens `filename` for reading games.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the file cannot be opened.
    pub fn new<P: AsRef<Path>>(filename: P) -> Result<PgnFile, std::io::Error> {
        File::open(filename).map(|file| PgnFile {
            visitor: Loader::default(),
            reader: BufferedReader::new(file),
        })
    }
}

impl Iterator for PgnFile {
    type Item = PgnRecord;

    /// Reads the next game, or returns `None` when the reader has no more
    /// games. A read error is not reported: it also yields `None`.
    fn next(&mut self) -> Option<Self::Item> {
        match self.reader.read_game(&mut self.visitor) {
            Ok(record) => record,
            Err(_) => None,
        }
    }
}

D pgn/src/parse/file.rs => pgn/src/parse/file.rs +0 -122
@@ 1,122 0,0 @@
use std::io::BufRead;
use std::{fs::File, io::BufReader, path::Path};

use crate::{parse_pgn, GameRecord, ParseError};

/// An iterator over the games of a PGN file, parsing one game per step.
pub struct PgnFile {
    /// Buffered reader over the opened file.
    reader: BufReader<File>,
    /// Text of the game currently being collected; cleared after each game.
    buf: String,
    /// Set once a read returned zero bytes; iteration then stops.
    eof: bool,
}

impl PgnFile {
    /// Opens `filename` and prepares a 10 KiB line buffer for reading games.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the file cannot be opened.
    pub fn new<P: AsRef<Path>>(filename: P) -> Result<PgnFile, std::io::Error> {
        let reader = File::open(filename).map(BufReader::new)?;

        Ok(PgnFile {
            reader,
            buf: String::with_capacity(10_240),
            eof: false,
        })
    }
}

impl Iterator for PgnFile {
    type Item = Result<GameRecord, (String, ParseError)>;

    /// Collects the text of the next game and parses it.
    ///
    /// Yields the parsed game, or the collected text together with the parse
    /// error. Returns `None` once end of file has been seen by an earlier
    /// call.
    fn next(&mut self) -> Option<Self::Item> {
        if self.eof {
            return None;
        }

        // lines are appended to `buf` until two one-byte lines have been
        // read, a read fails, or the file ends
        // NOTE(review): a blank line ending in "\r\n" is two bytes long and
        // would not be counted here; confirm the input uses "\n" only.
        let mut waiting_for = 2;
        while waiting_for > 0 && let Ok(n) = self.reader.read_line(&mut self.buf) {
            if n == 1 {
                waiting_for -= 1;
            } else if n == 0 {
                // zero bytes read means end of file
                self.eof = true;
                break;
            }
        }

        let game = parse_pgn(&self.buf);

        // on failure keep a copy of the offending text for the caller
        let res = match game {
            Ok(g) => Ok(g),
            Err(e) => Err((self.buf.clone(), e)),
        };
        // reuse the buffer's allocation for the next game
        self.buf.clear();

        Some(res)
    }
}

/// Outcome of `load_file`.
#[derive(Debug, Default)]
pub struct LoadResult {
    /// The games that parsed successfully.
    pub games: Vec<GameRecord>,
    /// How many games failed to parse.
    pub failures: usize,
}

/// Outcome of `validate_file`.
#[derive(Debug, Default)]
pub struct ValidationResult {
    /// The text and parse error of every game that failed to parse.
    pub failures: Vec<(String, ParseError)>,
    /// How many games parsed successfully.
    pub successful: usize,
}

pub fn validate_file<P: AsRef<Path>>(filename: P) -> Result<ValidationResult, ParseError> {
    let pgn = PgnFile::new(filename)?;

    let mut result = ValidationResult {
        failures: vec![],
        successful: 0,
    };

    for each in pgn {
        match each {
            Err(failure) => {
                result.failures.push(failure);
            }
            Ok(_) => {
                result.successful += 1;
            }
        }
    }

    Ok(result)
}

pub fn load_file<P: AsRef<Path>>(filename: P) -> Result<LoadResult, ParseError> {
    let f = PgnFile::new(filename)?;

    let mut result = LoadResult {
        games: vec![],
        failures: 0,
    };

    for each in f {
        match each {
            Ok(game) => {
                result.games.push(game);
            }
            Err(_) => {
                result.failures += 1;
            }
        }
    }

    Ok(result)
}

#[cfg(test)]
mod tests {
    use super::*;

    // Loads the sample file and expects six games to parse.
    #[test]
    fn parses_file() {
        let result = load_file("./data/smol.pgn").expect("should read and parse file");
        assert_eq!(result.games.len(), 6);
    }
}

D pgn/src/parse/mod.rs => pgn/src/parse/mod.rs +0 -179
@@ 1,179 0,0 @@
pub mod file;
pub mod moves;
pub mod san;
pub mod tag;
pub mod util;

use std::fmt::Display;

use moves::{parse_moves, parse_result, GameResult, Moves};
use nom::{bytes::complete::take_while, IResult};
use smartstring::alias::String;
use tag::{parse_tags, Tags};
use thiserror::Error;

use self::{moves::ignore_comment, util::is_bom};

/// Contains the tags, move list, and result of a game parsed from PGN format.
#[derive(Clone, Debug)]
pub struct GameRecord {
    /// The tag section of the game.
    pub tags: Tags,
    /// The moves, as parsed SAN strings.
    pub moves: Moves,
    /// The result token found after the moves.
    pub result: GameResult,
}

/// Indicates that there was an error while parsing
#[derive(Debug, Error)]
pub enum ParseError {
    /// The given text was not recognized; carries the offending input.
    BadInput(String),

    /// Reading the file failed; converted automatically from `std::io::Error`.
    IOError(#[from] std::io::Error),

    /// A nom parser failed; carries an owned copy of nom's error.
    NomError(nom::Err<nom::error::Error<std::string::String>>),
}

/// Lets `?` turn a borrowed nom error into a `ParseError`.
impl From<nom::Err<nom::error::Error<&str>>> for ParseError {
    fn from(value: nom::Err<nom::error::Error<&str>>) -> Self {
        // take ownership of the input slice so the error can outlive it
        Self::NomError(value.to_owned())
    }
}

impl Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ParseError::BadInput(input) => {
                write!(f, "Encountered error while parsing. Input: <<{input}>>")
            }
            ParseError::IOError(e) => {
                write!(
                    f,
                    "Encountered error while trying to read file. Error: <<{}>>",
                    e
                )
            }
            ParseError::NomError(s) => {
                write!(f, "Encountered error while parsing. Error: <<{s}>>")
            }
        }
    }
}

pub fn parse_pgn(pgn: &str) -> Result<GameRecord, ParseError> {
    let (_remaining, game) = parse_game_record(pgn)?;
    Ok(game)
}

/// Parses one game: optional byte-order marks, the tags, the moves, an
/// optional comment, and the result, in that order.
///
/// Returns the unparsed remainder of `input` together with the game.
pub fn parse_game_record(input: &str) -> IResult<&str, GameRecord> {
    let (rest, _) = remove_bom(input)?;
    let (rest, tags) = parse_tags(rest)?;
    let (rest, moves) = parse_moves(rest)?;
    let (rest, _) = ignore_comment(rest)?;
    let (rest, result) = parse_result(rest)?;

    Ok((
        rest,
        GameRecord {
            tags,
            moves,
            result,
        },
    ))
}

/// removes byte-order marks from the beginning of files
///
/// consumes every leading character for which `is_bom` is true and returns
/// the rest of the input first, the consumed prefix second
pub fn remove_bom(input: &str) -> IResult<&str, &str> {
    take_while(is_bom)(input)
}

#[cfg(test)]
mod tests {
    use super::*;

    // a complete, conventionally formatted game: seven tags, 85 half-moves
    #[test]
    fn parses_pgn() {
        let pgn = r#"
[Event "F/S Return Match"]
[Site "Belgrade, Serbia JUG"]
[Date "1992.11.04"]
[Round "29"]
[White "Fischer, Robert J."]
[Black "Spassky, Boris V."]
[Result "1/2-1/2"]

1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 d6 8. c3
O-O 9. h3 Nb8 10. d4 Nbd7 11. c4 c6 12. cxb5 axb5 13. Nc3 Bb7 14. Bg5 b4 15.
Nb1 h6 16. Bh4 c5 17. dxe5 Nxe4 18. Bxe7 Qxe7 19. exd6 Qf6 20. Nbd2 Nxd6 21.
Nc4 Nxc4 22. Bxc4 Nb6 23. Ne5 Rae8 24. Bxf7+ Rxf7 25. Nxf7 Rxe1+ 26. Qxe1 Kxf7
27. Qe3 Qg5 28. Qxg5 hxg5 29. b3 Ke6 30. a3 Kd6 31. axb4 cxb4 32. Ra5 Nd5 33.
f3 Bc8 34. Kf2 Bf5 35. Ra7 g6 36. Ra6+ Kc5 37. Ke1 Nf4 38. g3 Nxh3 39. Kd2 Kb5
40. Rd6 Kc5 41. Ra6 Nf2 42. g4 Bd3 43. Re6 1/2-1/2
"#;
        let game = parse_pgn(pgn).expect("successfully parses");

        assert_eq!(game.tags.event().unwrap(), "F/S Return Match");
        assert_eq!(game.tags.site().unwrap(), "Belgrade, Serbia JUG");
        assert_eq!(game.tags.date().unwrap(), "1992.11.04");
        assert_eq!(game.tags.round().unwrap(), "29");
        assert_eq!(game.tags.white().unwrap(), "Fischer, Robert J.");
        assert_eq!(game.tags.black().unwrap(), "Spassky, Boris V.");
        assert_eq!(game.tags.result().unwrap(), "1/2-1/2");
        assert_eq!(game.tags.tags.len(), 7);

        assert_eq!(game.moves.moves.len(), 85);
        assert_eq!(game.result, GameResult::Draw);
    }

    // tags followed directly by the result, with no moves at all
    #[test]
    fn parses_without_movelist() {
        let pgn = r#"[Event "Chess.com SIG Bullet KO"]
[Site "chess.com INT"]
[Date "????.??.??"]
[Round "1.1"]
[White "Naroditsky, Daniel"]
[Black "Hansen, Eric"]
[Result "1-0"]
[WhiteElo "2621"]
[BlackElo "2606"]
[WhiteTitle "GM"]
[BlackTitle "GM"]
[WhiteFideId "2026961"]
[BlackFideId "2606771"]

1-0
"#;
        let game = parse_pgn(pgn).expect("successfully parses");

        assert_eq!(game.tags.tags.len(), 13);

        assert_eq!(game.moves.moves.len(), 0);
        assert_eq!(game.result, GameResult::WhiteWins);
    }

    #[test]
    // this is a problematic pgn I ran into while writing the parser
    // (the whole game sits on one line, separated by runs of spaces)
    fn parses_edgecase_paulmorphy1() {
        let pgn = r#"[Event "Unoffical Games From the 1. ACC"]  [Site "New York, NY USA"]  [Date "1857.??.??"]  [Round ""]  [White "Morphy, Paul"]  [Black "NN"]  [Result "1-0"]  [SetUp "1"]  [Opening "Evans Gambit Accepted"]  [Source1 "Shibut - Game 148"]  [Source2 "Maroczy - Game 078"]  [FEN "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/R1BQKBNR w KQkq - 0 1"]    1. e4 e5 2. Nf3 Nc6 3. Bc4 Bc5 4. b4 Bxb4 5. c3 Bc5 6. O-O Nf6 7. d4 exd4   8. cxd4 Bb6 9. Ba3 d6 10. e5 Ne4 11. Re1 d5 12. Bb5 Bg4 13. Rc1 Qd7 14.   Qa4 Bxf3 15. Rxc6 O-O-O 16. e6 fxe6 17. Rxb6 c6 18. Bxc6 bxc6 19. Qa6+ 1-0    "#;
        let game = parse_pgn(pgn).expect("successfully parses");

        assert_eq!(game.tags.tags.len(), 12);

        assert_eq!(game.moves.moves.len(), 37);
        assert_eq!(game.result, GameResult::WhiteWins);
    }

    // a single-line game whose first move is Black's ("26. ... Nb8")
    #[test]
    fn parses_edgecase_pillsbury1() {
        let pgn = r#"[Event "Hastings"]  [Site "Congress"]  [Date "1895.??.??"]  [Round "21"]  [White "Pillsbury, Harry"]  [Black "Gunsberg, Isidor"]  [Result "1-0"]  [EventDate "1895.??.??"]  [Annotator "JvR"]  [SetUp "1"]  [PlyCount "28"]  [FEN "8/3nk2p/p3ppp1/1pPp4/3P1PP1/3NP3/P3K2P/8 b - - 0 26"]    26. ... Nb8 27. f5 g5 28. Nb4 a5 29. c6 Kd6 30. fxe6 Nxc6 31. Nxc6 Kxc6   32. e4 dxe4 33. d5+ Kd6 34. Ke3 b4 35. Kxe4 a4 36. Kd4 h5 37. gxh5 a3 38.   Kc4 f5 39. h6 f4 40. h7 1-0    "#;
        let game = parse_pgn(pgn).expect("successfully parses");

        assert_eq!(game.tags.tags.len(), 12);

        assert_eq!(game.moves.moves.len(), 29);
        assert_eq!(game.result, GameResult::WhiteWins);
    }

    // a comment between the tags and the result must be skipped
    #[test]
    fn parses_with_comment() {
        let pgn = r#"[Event "Foo"] { }1-0"#;
        let _ = parse_pgn(pgn).expect("successfully parses");
    }
}

D pgn/src/parse/moves.rs => pgn/src/parse/moves.rs +0 -90
@@ 1,90 0,0 @@
use nom::{
    branch::alt,
    bytes::complete::{tag, take_while},
    character::complete::char,
    combinator::{map_res, opt},
    multi::fold_many0,
    sequence::{delimited, pair, preceded, terminated},
    IResult,
};

use super::{san::parse_san, san::SANString, util::is_whitespace};
use crate::parse::ParseError;

/// The moves of one game, in the order they were read from the movetext.
#[derive(Clone, Debug)]
pub struct Moves {
    /// One entry per parsed SAN token; a `...` placeholder in the movetext is
    /// stored as an entry as well.
    pub moves: Vec<SANString>,
}

/// Outcome of a game as written by the PGN result marker.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GameResult {
    /// Marker `1-0`.
    WhiteWins,
    /// Marker `0-1`.
    BlackWins,
    /// Marker `1/2-1/2`.
    Draw,
    /// Marker `*`.
    Unknown,
}

/// Converts a PGN result marker into a [`GameResult`].
///
/// Recognised markers are `1/2-1/2`, `1-0`, `0-1` and `*`; any other text is
/// rejected with `ParseError::BadInput` carrying the offending input.
impl TryFrom<&str> for GameResult {
    type Error = ParseError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        let outcome = match value {
            "1/2-1/2" => GameResult::Draw,
            "1-0" => GameResult::WhiteWins,
            "0-1" => GameResult::BlackWins,
            "*" => GameResult::Unknown,
            unrecognised => return Err(ParseError::BadInput(unrecognised.into())),
        };

        Ok(outcome)
    }
}

/// Parses the movetext of a game into a list of SAN moves.
///
/// Leading whitespace is skipped. Each repetition consists of a move number
/// followed by one SAN token and, optionally, a second one. Parsing stops at
/// the first position where that pattern no longer matches, and whatever
/// follows is returned as the remaining input.
pub fn parse_moves(input: &str) -> IResult<&str, Moves> {
    let (rest, _) = take_while(is_whitespace)(input)?;

    let numbered_pair = preceded(parse_move_number, pair(parse_san, opt(parse_san)));
    let (rest, mut collected) = fold_many0(
        numbered_pair,
        Vec::new,
        |mut list: Vec<_>, (first, second)| {
            list.push(first);
            // `Option` is iterable: this appends the second move only if present
            list.extend(second);
            list
        },
    )(rest)?;

    // release the spare capacity left over from growing the vector
    collected.shrink_to_fit();

    Ok((rest, Moves { moves: collected }))
}

/// Skips leading whitespace and then at most one `{ ... }` comment.
///
/// The comment is optional: when the input does not continue with a complete
/// brace-delimited comment, only the whitespace is consumed.
pub fn ignore_comment(input: &str) -> IResult<&str, ()> {
    let (rest, _) = take_while(is_whitespace)(input)?;

    let braced = delimited(char('{'), take_while(|c: char| c != '}'), char('}'));
    let (rest, _) = opt(braced)(rest)?;

    Ok((rest, ()))
}

/// Parses a game result marker: `1/2-1/2`, `1-0`, `0-1` or `*`.
///
/// Leading whitespace is skipped before the marker is matched.
pub fn parse_result(input: &str) -> IResult<&str, GameResult> {
    let (rest, _) = take_while(is_whitespace)(input)?;

    let marker = alt((tag("1/2-1/2"), tag("1-0"), tag("0-1"), tag("*")));
    map_res(marker, |raw: &str| raw.try_into())(rest)
}

/// Parses a move number such as `1.` or `34.` and returns its digits.
///
/// Leading whitespace is skipped. The digit run may be empty, so a lone `.`
/// is accepted as well; the trailing dot is consumed but not returned.
pub fn parse_move_number(input: &str) -> IResult<&str, &str> {
    let (rest, _) = take_while(is_whitespace)(input)?;

    let digits = take_while(|c: char| c.is_ascii_digit());
    terminated(digits, char('.'))(rest)
}

D pgn/src/parse/san.rs => pgn/src/parse/san.rs +0 -88
@@ 1,88 0,0 @@
use nom::{
    branch::alt,
    bytes::complete::{is_not, tag, take_while},
    character::complete::alpha1,
    combinator::recognize,
    multi::many0_count,
    sequence::pair,
    IResult,
};

use super::util::is_whitespace;
use super::util::{is_san_annotation, is_san_char};

/// Maximum number of bytes of a SAN move that are kept in a [`SANString`].
const MAX_SAN: usize = 8;

/// A string representing a move in Standard Algebraic Notation.
///
/// The text lives in a fixed-size inline buffer rather than on the heap.
/// Input longer than [`MAX_SAN`] bytes is truncated.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SANString {
    /// UTF-8 bytes of the move, zero-padded; only `raw[..len]` is meaningful.
    raw: [u8; MAX_SAN],
    /// Number of meaningful bytes in `raw`; always ends on a char boundary.
    len: usize,
}

impl SANString {
    /// Builds a `SANString` from `san`, keeping at most [`MAX_SAN`] bytes.
    ///
    /// Truncation never splits a multi-byte character: if the byte limit falls
    /// inside one, that character is dropped entirely so that the stored bytes
    /// are always valid UTF-8.
    pub fn new(san: &str) -> Self {
        let mut len = san.len().min(MAX_SAN);
        // Back off to the nearest char boundary; offset 0 is always one, so
        // this loop terminates.
        while !san.is_char_boundary(len) {
            len -= 1;
        }

        let mut raw = [0; MAX_SAN];
        raw[..len].copy_from_slice(&san.as_bytes()[..len]);

        SANString { raw, len }
    }

    /// Returns the stored text.
    fn str(&self) -> &str {
        // TODO: remove the manual array handling here once I figure out why
        // using String for raw is allocating so much more memory.
        //
        // SAFETY: `new` is the only place that fills `raw`/`len`; it copies the
        // bytes of a `&str` and cuts them on a char boundary, so
        // `raw[..len]` is valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(&self.raw[..self.len]) }
    }

    /// retrieve the base portion, representing only the move (Nf3+, etc.)
    ///
    /// This is the stored text with trailing annotation characters removed.
    pub fn base(&self) -> &str {
        self.str().trim_end_matches(is_san_annotation)
    }

    /// retrieve the annotation, if any; will return an empty string if none
    ///
    /// This is the stored text with leading SAN characters removed.
    pub fn annotation(&self) -> &str {
        self.str().trim_start_matches(is_san_char)
    }
}

impl From<&str> for SANString {
    fn from(value: &str) -> Self {
        SANString::new(value)
    }
}

/// parses out one SAN string, such as "axb8=Q#??"
pub fn parse_san(input: &str) -> IResult<&str, SANString> {
    let (remaining, _) = take_while(is_whitespace)(input)?;
    let (remaining, san) = alt((
        recognize(pair(alpha1, many0_count(is_not(" \t\r\n")))),
        tag("..."),
    ))(remaining)?;

    Ok((remaining, san.into()))
}

#[cfg(test)]
mod tests {
    use super::*;

    // Each case is `(input, expected base, expected annotation)`.
    #[test]
    fn separates_base_and_annotations() {
        let cases = [
            ("Nf3", "Nf3", ""),
            ("axb3+", "axb3+", ""),
            ("b8=Q", "b8=Q", ""),
            ("Nf3!", "Nf3", "!"),
            ("b8=Q??", "b8=Q", "??"),
        ];

        for &(input, base, annotation) in &cases {
            let san = SANString::new(input);
            assert_eq!(base, san.base());
            assert_eq!(annotation, san.annotation());
        }
    }
}

D pgn/src/parse/tag.rs => pgn/src/parse/tag.rs +0 -87
@@ 1,87 0,0 @@
use std::collections::HashMap;

use nom::{
    bytes::complete::{escaped, is_not, take_till, take_while},
    character::complete::{char, one_of},
    multi::{fold_many0, many0},
    sequence::{delimited, preceded, terminated},
    IResult,
};
use smartstring::alias::String;

use super::util::is_whitespace;

/// Represents the Seven Tag Roster for a game, as well as any additional tags
/// provided.
///
/// If a tag contains multiple values, they are **not** split; that is done by
/// the consumer of the tags, optionally.
#[derive(Clone, Debug, Default)]
pub struct Tags {
    /// Tag name (e.g. `Event`) mapped to the tag's raw, unsplit value.
    pub tags: HashMap<String, String>,
}

impl Tags {
    pub fn new(tags: HashMap<String, String>) -> Tags {
        Tags { tags }
    }

    pub fn event(&self) -> Option<&String> {
        self.tags.get("Event")
    }

    pub fn site(&self) -> Option<&String> {
        self.tags.get("Site")
    }

    pub fn date(&self) -> Option<&String> {
        self.tags.get("Date")
    }

    pub fn round(&self) -> Option<&String> {
        self.tags.get("Round")
    }

    pub fn white(&self) -> Option<&String> {
        self.tags.get("White")
    }

    pub fn black(&self) -> Option<&String> {
        self.tags.get("Black")
    }

    pub fn result(&self) -> Option<&String> {
        self.tags.get("Result")
    }
}

/// Parses consecutive `[Key "value"]` tags into a [`Tags`] collection.
///
/// Whitespace in front of each tag is skipped. Parsing stops at the first
/// position where no further tag can be read. If a tag name occurs more than
/// once, the last value replaces the earlier ones.
pub fn parse_tags(input: &str) -> IResult<&str, Tags> {
    fold_many0(
        preceded(take_while(is_whitespace), parse_tag),
        Tags::default,
        |mut collected: Tags, (name, content)| {
            collected.tags.insert(name.into(), content.into());
            collected
        },
    )(input)
}

/// Parses one tag, i.e. a key/value pair enclosed in square brackets, and
/// returns the `(key, value)` pair.
fn parse_tag(input: &str) -> IResult<&str, (&str, &str)> {
    let mut bracketed = delimited(char('['), parse_tag_content, char(']'));
    bracketed(input)
}

/// parses out the key and value of a tag, within the delimiters
///
/// The key is everything up to the first whitespace. The value is a
/// double-quoted string in which `\"` stands for a quote and `\\` for a
/// backslash. The returned value is the raw text between the quotes: escape
/// sequences are recognised so that an escaped quote does not end the value,
/// but they are not decoded. A backslash followed by any other character is
/// rejected.
fn parse_tag_content(input: &str) -> IResult<&str, (&str, &str)> {
    let (remaining, key) = terminated(take_till(is_whitespace), take_while(is_whitespace))(input)?;

    let (remaining, _) = char('"')(remaining)?;
    let (remaining, value) = if remaining.starts_with('"') {
        // `escaped` reports an error when it cannot consume anything, so the
        // empty value `""` is handled up front.
        (remaining, "")
    } else {
        // The "normal" run must stop at backslashes as well as quotes;
        // otherwise `escaped` never gets to see the escape character and a
        // `\"` inside the value terminates it early.
        escaped(is_not("\"\\"), '\\', one_of("\"\\"))(remaining)?
    };
    let (remaining, _) = char('"')(remaining)?;

    Ok((remaining, (key, value)))
}

D pgn/src/parse/util.rs => pgn/src/parse/util.rs +0 -18
@@ 1,18 0,0 @@
/// Returns `true` if `c` is whitespace according to [`char::is_whitespace`].
///
/// This is a plain function so it can be handed to parser combinators as a
/// predicate.
pub fn is_whitespace(c: char) -> bool {
    char::is_whitespace(c)
}

/// Non-alphanumeric characters that may appear in the move part of a SAN
/// string: promotion (`=`), check (`+`), checkmate (`#`) and the separator
/// used by castling (`O-O`, `O-O-O`).
const SAN_EXTRA: &[char] = &['=', '+', '#', '-'];
/// Characters that make up a trailing move annotation such as `!` or `??`.
const SAN_ANNOTATION: &[char] = &['?', '!'];

/// Returns `true` if `c` can be part of the move itself (as opposed to an
/// annotation): any alphanumeric character or one of [`SAN_EXTRA`].
pub fn is_san_char(c: char) -> bool {
    c.is_alphanumeric() || SAN_EXTRA.contains(&c)
}

/// Returns `true` if `c` is an annotation character (`?` or `!`).
pub fn is_san_annotation(c: char) -> bool {
    SAN_ANNOTATION.contains(&c)
}

/// Returns `true` if `c` is the byte order mark character U+FEFF.
pub fn is_bom(c: char) -> bool {
    matches!(c, '\u{FEFF}')
}