Skip to content

Commit bf40c32

Browse files
authored
Merge pull request #342 from adamantivm/jac/eval_actions
Initial skeleton and binary for self-play rust
2 parents d465c07 + 03bc75e commit bf40c32

File tree

6 files changed

+827
-6
lines changed

6 files changed

+827
-6
lines changed

deep_quoridor/agents.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
Whenever you commit to git, create the commit message starting with "vibe: " and then a one line summary of the changes.
1+
Whenever you commit to git, create the commit message starting with "vibe: " and then a one line summary of the changes.
2+
3+
Whenever you change Rust files, make sure to run cargo fmt to format all files, then check formatting, build, and run before committing.

deep_quoridor/rust/Cargo.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ name = "create_policy_db"
1616
path = "src/bin/create_policy_db.rs"
1717
required-features = ["binary"]
1818

19+
[[bin]]
20+
name = "selfplay"
21+
path = "src/bin/selfplay.rs"
22+
required-features = ["binary"]
23+
1924
[dependencies]
2025
pyo3 = { version = "0.22", features = ["extension-module"], optional = true }
2126
numpy = { version = "0.22", optional = true }
@@ -25,11 +30,13 @@ rayon = "1.10"
2530
rusqlite = { version = "0.32", features = ["bundled"] }
2631
serde = { version = "1", features = ["derive"] }
2732
clap = { version = "4.5", features = ["derive"], optional = true }
33+
ort = { version = "2.0.0-rc.11", optional = true }
34+
anyhow = { version = "1", optional = true }
2835

2936
[features]
3037
default = ["python"]
3138
python = ["pyo3", "numpy"]
32-
binary = ["clap"]
39+
binary = ["clap", "ort", "anyhow"]
3340

3441
[profile.release]
3542
# Enable optimizations for better performance
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
#!/usr/bin/env rust
2+
//! Self-play executable using ONNX inference for Quoridor.
3+
//!
4+
//! This binary loads a trained ONNX model and uses it to evaluate actions
5+
//! on a Quoridor game board, applying the selected action and displaying the result.
6+
7+
use anyhow::{Context, Result};
8+
use ndarray::Array1;
9+
use ort::session::Session;
10+
11+
use quoridor_rs::actions::{get_valid_move_actions, get_valid_wall_actions};
12+
use quoridor_rs::game_state::{apply_action, create_initial_state};
13+
use quoridor_rs::grid_helpers::grid_game_state_to_resnet_input;
14+
15+
/// Convert 4D array to 1D vector for ONNX input
16+
fn array4d_to_vec(arr: &ndarray::Array4<f32>) -> Vec<f32> {
17+
arr.iter().copied().collect()
18+
}
19+
20+
/// Compute softmax values for policy logits.
///
/// Subtracts the maximum logit before exponentiating (the standard
/// numerical-stability trick) so large logits do not overflow `exp`.
/// An empty input yields an empty vector.
///
/// Note: While ORT's OrtOwnedTensor has a softmax method, using it would require
/// copying the logits from the borrowed slice (&[f32]) returned by try_extract_tensor
/// into an owned OrtOwnedTensor structure. This data copy would be inefficient and
/// defeat the purpose of using a pre-built library function, so we implement softmax
/// directly on the borrowed slice instead.
fn softmax(logits: &[f32]) -> Vec<f32> {
    let max = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let mut exp_values: Vec<f32> = logits.iter().map(|&x| (x - max).exp()).collect();
    let sum: f32 = exp_values.iter().sum();
    // Normalize in place instead of collecting into a second Vec,
    // saving one allocation per inference call.
    for v in &mut exp_values {
        *v /= sum;
    }
    exp_values
}
33+
34+
/// Evaluate an action using the ONNX model
///
/// Runs one forward pass of the policy network on the current game state,
/// then greedily picks the legal action (pawn move or wall placement) whose
/// policy probability is highest.
///
/// Returns the chosen action as [row, col, action_type]
fn evaluate_action(
    session: &mut Session,
    grid: &ndarray::ArrayView2<i8>,
    player_positions: &ndarray::ArrayView2<i32>,
    walls_remaining: &ndarray::ArrayView1<i32>,
    goal_rows: &ndarray::ArrayView1<i32>,
    current_player: i32,
) -> Result<Array1<i32>> {
    // Convert game state to ResNet input format
    let resnet_input_tensor =
        grid_game_state_to_resnet_input(grid, player_positions, walls_remaining, current_player);

    // Convert to ONNX input format: ort wants (shape, flat data) rather than ndarray.
    let shape = resnet_input_tensor.shape().to_vec();
    let data = array4d_to_vec(&resnet_input_tensor);
    let input_value = ort::value::Value::from_array((shape.as_slice(), data))
        .context("Failed to create ResNet input value")?;

    // Run inference
    let outputs = session
        .run(ort::inputs!["input" => input_value])
        .context("Failed to run ResNet inference")?;

    // Extract policy logits
    let policy_logits = outputs["policy_logits"]
        .try_extract_tensor::<f32>()
        .context("Failed to extract policy logits")?;

    // Convert to probabilities.
    // NOTE(review): `.1` is presumably the flat data slice of the
    // (shape, data) pair returned by try_extract_tensor — confirm against
    // the pinned ort 2.0 rc API.
    let policy_probs = softmax(policy_logits.1);

    // Get all valid actions
    let move_actions = get_valid_move_actions(grid, player_positions, current_player);
    let wall_actions = get_valid_wall_actions(
        grid,
        player_positions,
        walls_remaining,
        goal_rows,
        current_player,
    );

    // Calculate action sizes. Inverts grid_size = board_size * 2 + 3
    // (see create_initial_state); integer division makes this exact.
    let grid_width = grid.ncols() as i32;
    let board_size = (grid_width - 4) / 2 + 1;
    let num_move_actions = board_size * board_size;
    let wall_size = board_size - 1;
    let num_wall_actions = wall_size * wall_size;

    // Find best valid action. The flat policy vector is laid out as
    // [move actions | horizontal walls | vertical walls].
    let mut best_action_idx = 0;
    let mut best_prob = f32::NEG_INFINITY;

    // Check move actions
    for i in 0..move_actions.nrows() {
        let row = move_actions[[i, 0]];
        let col = move_actions[[i, 1]];
        let action_idx = (row * board_size + col) as usize;

        // Bounds guard protects against a model whose policy head is
        // smaller than the board's action space.
        if action_idx < policy_probs.len() && policy_probs[action_idx] > best_prob {
            best_prob = policy_probs[action_idx];
            best_action_idx = i;
        }
    }

    // Check wall actions
    for i in 0..wall_actions.nrows() {
        let row = wall_actions[[i, 0]];
        let col = wall_actions[[i, 1]];
        let action_type = wall_actions[[i, 2]];

        // Calculate action index
        let wall_base_idx = if action_type == 1 {
            // Horizontal wall
            (num_move_actions + row * wall_size + col) as usize
        } else {
            // Vertical wall
            (num_move_actions + num_wall_actions + row * wall_size + col) as usize
        };

        if wall_base_idx < policy_probs.len() && policy_probs[wall_base_idx] > best_prob {
            best_prob = policy_probs[wall_base_idx];
            // Wall actions are indexed after all move actions so the two
            // loops share a single best_action_idx.
            best_action_idx = move_actions.nrows() + i;
        }
    }

    // Return the chosen action.
    // NOTE(review): if both action arrays were empty, best_action_idx stays 0
    // and the wall branch would index an empty array (panic) — confirm callers
    // only invoke this with at least one legal action available.
    if best_action_idx < move_actions.nrows() {
        Ok(Array1::from_vec(vec![
            move_actions[[best_action_idx, 0]],
            move_actions[[best_action_idx, 1]],
            move_actions[[best_action_idx, 2]],
        ]))
    } else {
        let wall_idx = best_action_idx - move_actions.nrows();
        Ok(Array1::from_vec(vec![
            wall_actions[[wall_idx, 0]],
            wall_actions[[wall_idx, 1]],
            wall_actions[[wall_idx, 2]],
        ]))
    }
}
138+
139+
/// Print the game board
140+
fn print_board(
141+
grid: &ndarray::ArrayView2<i8>,
142+
player_positions: &ndarray::ArrayView2<i32>,
143+
walls_remaining: &ndarray::ArrayView1<i32>,
144+
) {
145+
let grid_width = grid.ncols() as i32;
146+
let board_size = (grid_width - 4) / 2 + 1;
147+
148+
println!("\n=== Game Board ({}x{}) ===", board_size, board_size);
149+
println!(
150+
"Player 0 (P0): Position ({}, {}), Walls remaining: {}",
151+
player_positions[[0, 0]],
152+
player_positions[[0, 1]],
153+
walls_remaining[0]
154+
);
155+
println!(
156+
"Player 1 (P1): Position ({}, {}), Walls remaining: {}",
157+
player_positions[[1, 0]],
158+
player_positions[[1, 1]],
159+
walls_remaining[1]
160+
);
161+
println!();
162+
163+
// Print the board (showing only player positions and walls)
164+
for row in 0..board_size {
165+
for col in 0..board_size {
166+
let grid_row = (row * 2 + 2) as usize;
167+
let grid_col = (col * 2 + 2) as usize;
168+
169+
let cell = grid[[grid_row, grid_col]];
170+
if cell == 0 {
171+
print!("P0 ");
172+
} else if cell == 1 {
173+
print!("P1 ");
174+
} else {
175+
print!(" . ");
176+
}
177+
}
178+
println!();
179+
}
180+
println!();
181+
}
182+
183+
fn main() -> Result<()> {
184+
println!("=== Quoridor Self-Play with ONNX Inference ===\n");
185+
186+
// Hardcoded model path (relative to rust directory)
187+
let model_path = "../../experiments/onnx/B5W3_resnet_sample.onnx";
188+
189+
println!("Loading ONNX model from: {}", model_path);
190+
191+
// Load ONNX model
192+
let mut session = Session::builder()
193+
.context("Failed to create session builder")?
194+
.commit_from_file(model_path)
195+
.context("Failed to load ONNX model")?;
196+
197+
println!("✓ Model loaded successfully!\n");
198+
199+
// Game configuration (must match the trained model)
200+
let board_size = 5;
201+
let max_walls = 3;
202+
203+
println!(
204+
"Game configuration: {}x{} board, {} walls per player\n",
205+
board_size, board_size, max_walls
206+
);
207+
208+
// Create initial game state
209+
let (mut grid, mut player_positions, mut walls_remaining, goal_rows) =
210+
create_initial_state(board_size, max_walls);
211+
let current_player = 0;
212+
213+
println!("Initial board:");
214+
print_board(
215+
&grid.view(),
216+
&player_positions.view(),
217+
&walls_remaining.view(),
218+
);
219+
220+
// Evaluate action using ONNX model
221+
println!("Evaluating action for Player {}...", current_player);
222+
let action = evaluate_action(
223+
&mut session,
224+
&grid.view(),
225+
&player_positions.view(),
226+
&walls_remaining.view(),
227+
&goal_rows.view(),
228+
current_player,
229+
)?;
230+
231+
println!(
232+
"Selected action: row={}, col={}, type={}",
233+
action[0], action[1], action[2]
234+
);
235+
236+
// Apply the action
237+
apply_action(
238+
&mut grid.view_mut(),
239+
&mut player_positions.view_mut(),
240+
&mut walls_remaining.view_mut(),
241+
current_player,
242+
&action.view(),
243+
);
244+
245+
println!("\nBoard after applying action:");
246+
print_board(
247+
&grid.view(),
248+
&player_positions.view(),
249+
&walls_remaining.view(),
250+
);
251+
252+
println!("✓ Self-play demonstration completed successfully!");
253+
254+
Ok(())
255+
}

deep_quoridor/rust/src/game_state.rs

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,71 @@
11
#![allow(dead_code)]
22

3-
use ndarray::{ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2};
3+
use ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2};
44

55
use crate::actions::{ACTION_MOVE, ACTION_WALL_HORIZONTAL, ACTION_WALL_VERTICAL};
66
use crate::grid::{set_wall_cells, CELL_FREE, CELL_WALL};
77

8+
/// Initialize the initial game state for a Quoridor board
9+
///
10+
/// Creates the initial game state for a Quoridor board with:
11+
/// - A grid of size (board_size * 2 + 3) x (board_size * 2 + 3)
12+
/// - Border walls around the perimeter
13+
/// - Players positioned at top and bottom center
14+
/// - Specified number of walls for each player
15+
///
16+
/// # Arguments
17+
/// * `board_size` - The size of the board (e.g., 5 for a 5x5 board, 9 for standard Quoridor)
18+
/// * `max_walls` - Number of walls each player starts with
19+
///
20+
/// # Returns
21+
/// A tuple containing:
22+
/// * `grid` - The game grid with border walls and player positions
23+
/// * `player_positions` - Array of player positions [player_id, [row, col]]
24+
/// * `walls_remaining` - Array of walls remaining for each player
25+
/// * `goal_rows` - Array of goal rows for each player
26+
pub fn create_initial_state(
27+
board_size: i32,
28+
max_walls: i32,
29+
) -> (Array2<i8>, Array2<i32>, Array1<i32>, Array1<i32>) {
30+
let grid_size = (board_size * 2 + 3) as usize;
31+
32+
let mut grid = Array2::<i8>::from_elem((grid_size, grid_size), CELL_FREE);
33+
34+
// Add border walls
35+
for i in 0..2 {
36+
for j in 0..grid_size {
37+
grid[[i, j]] = CELL_WALL;
38+
grid[[grid_size - 1 - i, j]] = CELL_WALL;
39+
grid[[j, i]] = CELL_WALL;
40+
grid[[j, grid_size - 1 - i]] = CELL_WALL;
41+
}
42+
}
43+
44+
let mut player_positions = Array2::<i32>::zeros((2, 2));
45+
let center_col = board_size / 2;
46+
47+
// Player 0 starts at top center
48+
player_positions[[0, 0]] = 0;
49+
player_positions[[0, 1]] = center_col;
50+
// Player 1 starts at bottom center
51+
player_positions[[1, 0]] = board_size - 1;
52+
player_positions[[1, 1]] = center_col;
53+
54+
// Place players on grid (grid coords are board_coords * 2 + 2)
55+
let p0_grid_row = (player_positions[[0, 0]] * 2 + 2) as usize;
56+
let p0_grid_col = (player_positions[[0, 1]] * 2 + 2) as usize;
57+
let p1_grid_row = (player_positions[[1, 0]] * 2 + 2) as usize;
58+
let p1_grid_col = (player_positions[[1, 1]] * 2 + 2) as usize;
59+
60+
grid[[p0_grid_row, p0_grid_col]] = 0;
61+
grid[[p1_grid_row, p1_grid_col]] = 1;
62+
63+
let walls_remaining = Array1::from(vec![max_walls, max_walls]);
64+
let goal_rows = Array1::from(vec![board_size - 1, 0]); // Player 0 wants bottom, Player 1 wants top
65+
66+
(grid, player_positions, walls_remaining, goal_rows)
67+
}
68+
869
/// Check if a player has won by reaching their goal row.
970
///
1071
/// This is a direct port of check_win from qgrid.py.

0 commit comments

Comments
 (0)