~ntietz/isabella-db

5fecfcc6d07f4fd93b9cb2593cd11f59ef27b163 — Nicholas Tietz-Sokolsky 1 year, 1 month ago 2caaa56 13-parallel-lod-and-index
PGN preprocessor script
2 files changed, 39 insertions(+), 4 deletions(-)

M isabella/src/bin/idb.rs
A isabella/src/bin/preprocessor.rs
M isabella/src/bin/idb.rs => isabella/src/bin/idb.rs +5 -4
@@ 81,7 81,7 @@ fn main() {
}

fn index_shards(db_dir_path: &str) {
    let par_args: Vec<String> = read_dir(db_dir_path)
    let mut par_args: Vec<String> = read_dir(db_dir_path)
        .expect("should read directory")
        .map(|file| {
            file.expect("file should exist")


@@ 92,19 92,20 @@ fn index_shards(db_dir_path: &str) {
        })
        .filter(|p| p.ends_with(".isa"))
        .collect();
    par_args.reverse();

    par_args.par_iter().for_each(|path| {
        let mut file: File = File::open(&path).expect("should open the file");
        let mut buf: Vec<u8> = Vec::new();
        file.read_to_end(&mut buf).expect("should read");
        let db: GameDB = bincode::deserialize(&buf).expect("deserializing should work");
        tracing::info!("loaded {} games", db.len());
        tracing::info!(path=path, "loaded {} games", db.len());
        drop(buf);
        drop(file);
        tracing::info!("dropped buf and file");
        tracing::info!(path=path, "dropped buf and file");
        let results_index = GameResultIndex::construct(&db);
        let position_index = PositionIndex::construct(&db);
        save_all(&path, &position_index, &results_index);
        tracing::info!("finished saving indexes");
        tracing::info!(path=path, "finished saving indexes");
    });
}

A isabella/src/bin/preprocessor.rs => isabella/src/bin/preprocessor.rs +34 -0
@@ 0,0 1,34 @@
use std::fs::{read_to_string, write};

use clap::Parser;


/// Split one large PGN file into several smaller shard files.
#[derive(Parser, Debug)]
struct Args {
    /// Path of the PGN file to read and split.
    pgn_input_path: String,
    /// Directory the shard-<N>.pgn output files are written into.
    pgn_output_dir: String,

    /// Number of shard files to distribute the games across (must be at least 1).
    #[arg(short, long)]
    num_shards: usize,
}

/// Distributes the games of a PGN document round-robin over `num_shards` strings.
///
/// The document is split on blank lines (`"\n\n"`) and consecutive pairs of
/// fragments are treated as one game: the tag section followed by the movetext.
/// Game `i` lands in shard `i % num_shards`. Every game is terminated with a
/// blank line so that adjacent games inside a shard stay separated.
///
/// Fragments that are empty after pairing (for example the remainder produced
/// by a trailing blank line at the end of the input) are dropped.
///
/// # Panics
///
/// Panics if `num_shards` is zero.
fn shard_games(full_pgn: &str, num_shards: usize) -> Vec<String> {
    assert!(num_shards > 0, "num_shards must be at least 1");

    let parts: Vec<&str> = full_pgn.split("\n\n").collect();
    let mut shards = vec![String::new(); num_shards];

    for (idx, chunk) in parts.chunks(2).enumerate() {
        let game = chunk.join("\n\n");
        let game = game.trim_end();
        if game.is_empty() {
            continue;
        }

        let shard = &mut shards[idx % num_shards];
        shard.push_str(game);
        // Without this separator the last movetext line of one game would be
        // glued to the first tag of the next game in the same shard.
        shard.push_str("\n\n");
    }

    shards
}

/// Entry point: reads the input PGN, splits it into `--num-shards` shards and
/// writes each one to `<pgn_output_dir>/shard-<idx>.pgn`.
///
/// Exits with a non-zero status and a message on stderr when the shard count
/// is zero or when the input cannot be read or a shard cannot be written.
pub fn main() {
    let args = Args::parse();

    // A zero shard count would otherwise panic on the `idx % num_shards` step.
    if args.num_shards == 0 {
        eprintln!("error: --num-shards must be at least 1");
        std::process::exit(2);
    }

    let full_pgn = read_to_string(&args.pgn_input_path).unwrap_or_else(|err| {
        eprintln!("error: could not read {}: {err}", args.pgn_input_path);
        std::process::exit(1);
    });

    let shards = shard_games(&full_pgn, args.num_shards);

    let output_dir = std::path::Path::new(&args.pgn_output_dir);
    for (idx, shard) in shards.iter().enumerate() {
        let path = output_dir.join(format!("shard-{idx}.pgn"));
        if let Err(err) = write(&path, shard) {
            eprintln!("error: could not write {}: {err}", path.display());
            std::process::exit(1);
        }
    }
}