~ntietz/isabella-db

08eebe877415eb69761a769ed56631bcacef7e52 — Nicole Tietz-Sokolskaya 1 year, 1 month ago 29aacce
Parallelize loading of PGNs and add script to split PGNs into
smaller files for more efficient loading.

Implements: https://todo.sr.ht/~ntietz/isabella-db/13
M Cargo.lock => Cargo.lock +3 -3
@@ 866,6 866,7 @@ dependencies = [
 "criterion",
 "pgn",
 "rand",
 "rayon",
 "rmp-serde",
 "serde",
 "shakmaty",


@@ 1287,11 1288,10 @@ dependencies = [

[[package]]
name = "rayon"
version = "1.6.0"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e060280438193c554f654141c9ea9417886713b7acd75974c85b18a69a88e0b"
checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7"
dependencies = [
 "crossbeam-deque",
 "either",
 "rayon-core",
]

M Cargo.toml => Cargo.toml +1 -0
@@ 4,6 4,7 @@ members = ["bitmap", "isabella", "pgn"]

[workspace.dependencies]

rayon = "1.6.1"
serde = { version = "1.0", features = ["derive"] }
shakmaty = "0.22.0"
thiserror = "1.0.37"

M isabella/Cargo.toml => isabella/Cargo.toml +2 -0
@@ 27,6 27,8 @@ actix-web = "4.2.1"
actix-files = "0.6.2"
askama = "0.11.1"

rayon = { workspace = true }

[dev-dependencies]
criterion = { workspace = true }


M isabella/src/bin/idb.rs => isabella/src/bin/idb.rs +53 -21
@@ 1,10 1,11 @@
use std::fs::File;
use std::fs::{read_dir, File};
use std::io::{Read, Write};

use clap::{Parser, Subcommand};
use isabella_db::db::GameDB;
use isabella_db::db::{load_from_file, GameDB};
use isabella_db::index::{save::save_all, GameResultIndex, PositionIndex};
use pgn::load::PgnFile;
use rayon::prelude::*;
use tracing_subscriber::fmt::format::FmtSpan;
use tracing_subscriber::EnvFilter;



@@ 21,6 22,7 @@ struct Args {
// Subcommands selected in `main` by matching on this enum.
// NOTE(review): plain `//` comments are used on purpose. If this enum derives
// clap's `Subcommand` (the attributes are not visible in this view), `///` doc
// comments would be turned into help text and change the CLI output.
enum Commands {
    // Carries `pgn_filename`, presumably the single PGN file to convert; the
    // handling arm is not fully visible here, so confirm against `main`.
    Convert { pgn_filename: String },
    // Has no fields of its own; `main` passes `args.gamedb_filename` to
    // `index_shards` for this variant.
    Index,
    // Carries a directory of PGN files. `main` lists that directory and writes
    // one `shard-{idx}.isa` per entry, treating `args.gamedb_filename` as the
    // output directory.
    ConvertBatch { pgn_directory: String },
}

fn main() {


@@ 53,27 55,57 @@ fn main() {
            std::mem::forget(buf);
            std::mem::forget(db);
        }
        Commands::Index => {
            let mut file: File = File::open(&args.gamedb_filename).expect("should open the file");
            let mut buf: Vec<u8> = Vec::new();
            file.read_to_end(&mut buf).expect("should read");

            let db: GameDB = bincode::deserialize(&buf).expect("deserializing should work");
            tracing::info!("loaded {} games", db.len());
            drop(buf);
            drop(file);
            tracing::info!("dropped buf and file");

            let results_index = GameResultIndex::construct(&db);
            let position_index = PositionIndex::construct(&db);
        Commands::ConvertBatch { pgn_directory } => {
            // TODO: make directory, clean up if it exists already or warn?
            let output_dir = args.gamedb_filename;

            save_all(&args.gamedb_filename, &position_index, &results_index);
            tracing::info!("finished saving indexes");
            let par_args: Vec<(String, String)> = read_dir(pgn_directory)
                .expect("should read directory")
                .enumerate()
                .map(|(idx, file)| {
                    let entry_path = file.expect("file should exist").path();
                    let pgn_path = entry_path.to_str().unwrap();
                    let output_path = format!("{output_dir}/shard-{idx}.isa");
                    (pgn_path.to_string(), output_path)
                })
                .collect();

            // To speed up exit times, we just forget about the memory without dropping it.
            std::mem::forget(db);
            std::mem::forget(results_index);
            std::mem::forget(position_index);
            par_args
                .par_iter()
                .for_each(|(input_path, output_path)| load_from_file(input_path, output_path));
        }
        Commands::Index => {
            index_shards(&args.gamedb_filename);
        }
    };
}

/// Builds and saves the game-result and position indexes for every `.isa`
/// file found directly inside `db_dir_path`.
///
/// The shard paths are gathered up front (in reversed directory-listing
/// order) and then processed in parallel with rayon. Any I/O or
/// deserialization failure panics.
fn index_shards(db_dir_path: &str) {
    // Gather the shard paths first so rayon can fan out over a plain Vec.
    let mut shard_paths: Vec<String> = Vec::new();
    for entry in read_dir(db_dir_path).expect("should read directory") {
        let entry_path = entry.expect("file should exist").path();
        let as_string = entry_path.to_str().unwrap().to_string();
        if as_string.ends_with(".isa") {
            shard_paths.push(as_string);
        }
    }
    shard_paths.reverse();

    shard_paths.par_iter().for_each(|shard_path| {
        // The raw bytes and the file handle only live inside this block, so
        // they are released (buffer first, then file) before indexing starts.
        let db: GameDB = {
            let mut shard_file: File = File::open(shard_path).expect("should open the file");
            let mut raw: Vec<u8> = Vec::new();
            shard_file.read_to_end(&mut raw).expect("should read");

            let loaded: GameDB = bincode::deserialize(&raw).expect("deserializing should work");
            tracing::info!(path = shard_path, "loaded {} games", loaded.len());
            loaded
        };
        tracing::info!(path = shard_path, "dropped buf and file");

        let results_index = GameResultIndex::construct(&db);
        let position_index = PositionIndex::construct(&db);
        save_all(shard_path, &position_index, &results_index);
        tracing::info!(path = shard_path, "finished saving indexes");
    });
}

A isabella/src/bin/preprocessor.rs => isabella/src/bin/preprocessor.rs +33 -0
@@ 0,0 1,33 @@
use std::fs::{read_to_string, write};

use clap::Parser;

// Command-line arguments for the preprocessor, parsed through clap's derive API.
// NOTE: plain `//` comments are used on purpose. clap turns `///` doc comments
// on this struct and its fields into help text, which would change the output.
#[derive(Parser, Debug)]
struct Args {
    // Path of the PGN file that `main` reads in full and splits up.
    pgn_input_path: String,
    // Directory that receives the `shard-{idx}.pgn` output files.
    pgn_output_dir: String,

    // Number of shard files to produce. `short, long` without explicit names
    // makes clap derive the flags from the field name (`-n` / `--num-shards`).
    #[arg(short, long)]
    num_shards: usize,
}

/// Splits one large PGN file into `num_shards` smaller PGN files.
///
/// Reads `pgn_input_path` in full, distributes its games round-robin over
/// the shards, and writes each shard to `<pgn_output_dir>/shard-<idx>.pgn`.
///
/// # Panics
///
/// Panics if the input cannot be read, if `num_shards` is zero, or if a
/// shard file cannot be written.
pub fn main() {
    let args = Args::parse();

    let full_pgn = read_to_string(&args.pgn_input_path)
        .unwrap_or_else(|err| panic!("failed to read {}: {err}", args.pgn_input_path));
    let shards = split_into_shards(&full_pgn, args.num_shards);

    for (idx, shard) in shards.iter().enumerate() {
        let path = std::path::Path::new(&args.pgn_output_dir).join(format!("shard-{idx}.pgn"));
        write(&path, shard)
            .unwrap_or_else(|err| panic!("failed to write {}: {err}", path.display()));
    }
}

/// Distributes the games in `full_pgn` round-robin over `num_shards` strings.
///
/// The input is assumed to separate the tag section from the movetext, and
/// one game from the next, with a single blank line (`"\n\n"`). Two
/// consecutive pieces therefore form one game.
fn split_into_shards(full_pgn: &str, num_shards: usize) -> Vec<String> {
    assert!(num_shards > 0, "num_shards must be at least 1");

    let parts: Vec<&str> = full_pgn.split("\n\n").collect();
    let mut shards = vec![String::new(); num_shards];

    for (idx, game) in parts.chunks(2).enumerate() {
        let game_text = game.join("\n\n");
        // A trailing blank line in the input yields an empty final piece;
        // skip it instead of appending a bare separator.
        if game_text.trim().is_empty() {
            continue;
        }

        let shard = &mut shards[idx % num_shards];
        shard.push_str(&game_text);
        // Splitting consumed the blank line that followed this game. Put it
        // back so the next game's tags do not run into this game's movetext.
        shard.push_str("\n\n");
    }

    shards
}

M isabella/src/db/mod.rs => isabella/src/db/mod.rs +20 -0
@@ 1,4 1,6 @@
use std::fmt::Debug;
use std::fs::File;
use std::io::Write;

use pgn::PgnFile;
use serde::{Deserialize, Serialize};


@@ 88,3 90,21 @@ impl GameDB {
        db
    }
}

/// Converts the PGN file at `pgn_path` into a serialized [`GameDB`] written
/// to `output_path`.
///
/// The database is built with `GameDB::from_pgn`, sorted, serialized with
/// bincode, and written out in one go. An existing file at `output_path` is
/// replaced.
///
/// # Panics
///
/// Panics if the PGN file cannot be opened, if serialization fails, or if the
/// output file cannot be created or written.
pub fn load_from_file(pgn_path: &str, output_path: &str) {
    let file = PgnFile::new(pgn_path).expect("should open the file");
    let mut db = GameDB::from_pgn(file);
    db.sort();

    tracing::info!("constructed database");
    let buf = bincode::serialize(&db).expect("serializing should work");
    tracing::info!("serialized database");

    // `File::create` truncates an existing file. Opening with only
    // `write + create` would leave stale trailing bytes behind whenever the
    // new database is shorter than a previously written one.
    let mut outfile = File::create(output_path).expect("should open file to write");
    outfile.write_all(&buf).expect("should write to file");
    tracing::info!("wrote database");
}

M isabella/src/web/templates.rs => isabella/src/web/templates.rs +1 -1
@@ 170,7 170,7 @@ pub fn html_display_piece(piece: Option<Piece>) -> String {
            (Color::Black, Role::King) => "bK",
        };

        format!("<img src=\"/static/images/chess/wikipedia/{}.png\" />", s)
        format!("<img src=\"/static/images/chess/wikipedia/{s}.png\" />")
    } else {
        "&nbsp;".into()
    }