From 56343b1bab984df7edf5b6871118dc6ea6064e6c Mon Sep 17 00:00:00 2001
From: PhilipGe
Date: Mon, 17 Mar 2025 21:49:09 -0400
Subject: [PATCH 1/3] Created initial implementation of QSession class that
 provides an API for Deep Q model training and utilization.

Calls to the API can be injected into the code of an agent traversing a state
space to serve Q values. The API serves Q values based on the calling agent's
state, action, resulting state if the action is taken, and resulting reward if
the action is taken: (s, a, s', r), by passing them through a neural net. The
neural net is trained through the 'explore' function, which updates the
weights based on a specific instance of (s, a, s', r).
---
 deep_q/deep_q_session.py | 135 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 deep_q/deep_q_session.py

diff --git a/deep_q/deep_q_session.py b/deep_q/deep_q_session.py
new file mode 100644
index 0000000..0730798
--- /dev/null
+++ b/deep_q/deep_q_session.py
@@ -0,0 +1,135 @@
+from __future__ import annotations
+import numpy as np
+import torch
+from q_model import QModel
+
+# Utility function. Outside of QSession because it requires knowledge of the actions that could be taken
+def get_maximal_Q(session: QSession, s_prime):
+
+    # Get all of the actions that can be taken from s_prime
+    actions = []
+
+    action_Qs = np.zeros(len(actions))
+
+    for i, a in enumerate(actions):
+        action_Qs[i] = session.forward_no_grad(session.encode(s_prime, a))
+
+    return action_Qs.max()
+
+class QModel(torch.nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, data):
+        return data
+
+class QSession:
+
+    def __init__(self):
+
+        # Initialized in either new_model or load_model
+        self.model: torch.nn.Module = None
+        self.optimizer = None
+
+        # This will likely just hold a list of tensors
+        self.replay_buffer = []
+
+        # The weight of future rewards. Set this to something reasonable
+        self.gamma = 0.9
+
+    # Somehow encodes a state and action into a tensor
+    def encode(self, state, action) -> torch.Tensor:
+        pass
+
+    # Set this to a reasonable loss function
+    def loss_function(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        return (x-y).square().mean()
+
+    # Will store in replay_buffer
+    def register_in_buffer(self, state, action, reward, s_prime):
+        self.replay_buffer(
+            [state, action, reward, s_prime]
+        )
+
+    # This trains network on values seen in the past to regularize
+    def train_on_past_values(self, batch_size):
+
+        # Sample random values from replay_buffer
+
+        samples = np.random.randint(0,len(self.replay_buffer), size=batch_size)
+
+        for i in samples:
+            self.explore(*self.replay_buffer[i], new=False)
+
+    # Updates network based on single step.
+    def explore(self, state, action, reward, s_prime, new = True) -> torch.Tensor :
+
+        if(new): self.register_in_buffer(state, action, reward, s_prime)
+
+        input_encoded = session.encode(state, action)
+
+        # Forward pass of the network
+        pred_Q = self.model(input_encoded)
+
+        # Get Q target of network
+        target_Q = reward + get_maximal_Q(self, s_prime)
+
+        # Calculate loss
+        loss = self.loss_function(pred_Q, target_Q)
+
+        # Update weight
+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
+
+        self.train_on_past_values()
+
+        # Return loss
+        return loss
+
+    # This is used to determine max_a'(Q(s',a'))
+    def forward_no_grad(self, state, action):
+
+        with torch.no_grad():
+            pred_Q = self.model(self.encode(state, action))
+
+        return pred_Q
+
+    def new_model(self):
+        self.model = QModel()
+        self.optimizer = torch.optim.Adam(
+            filter(lambda x: x.requires_grad, self.model.parameters()),
+            lr=0.001,
+        )
+
+    def save_model(self, save_path):
+        torch.save(self.model, save_path)
+
+    def load_model(self, model_filepath):
+        self.model = torch.load(model_filepath)
+        self.optimizer = torch.optim.Adam(
+            filter(lambda x: x.requires_grad, self.model.parameters()),
+            lr=0.001,
+        )
+
+if __name__ == "__main__":
+
+    # Example instance
+    session = QSession()
+    session.new_model()
+
+    # Agent will traverse the state space. As it traverses, it gets states, actions, and rewards from the environment.
+    # Run this as many times as you want, exploring different parts of state space
+    state, action, reward, s_prime = None, None, None, None
+    session.explore(state, action, reward, s_prime)
+
+    # Save model trained somewhere
+    save_path = ""
+    session.save_model(save_path)
+
+    # Load later
+    session.load_model(save_path)
+
+    # Keep training, or go ahead and actually use by calling forward_no_grad
+    pred_Q = session.forward_no_grad(state, action)
\ No newline at end of file

From fb9bf8a54eb86a3b5afb42efc8311c42c1463e38 Mon Sep 17 00:00:00 2001
From: PhilipGe
Date: Tue, 18 Mar 2025 01:34:00 -0400
Subject: [PATCH 2/3] Updated 'encode' to match the neural net inputs. Updated
 'train' to fix an infinite recursion bug.
---
 deep_q/deep_q_session.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/deep_q/deep_q_session.py b/deep_q/deep_q_session.py
index 0730798..8d79acb 100644
--- a/deep_q/deep_q_session.py
+++ b/deep_q/deep_q_session.py
@@ -12,7 +12,7 @@ def get_maximal_Q(session: QSession, s_prime):
     action_Qs = np.zeros(len(actions))

     for i, a in enumerate(actions):
-        action_Qs[i] = session.forward_no_grad(session.encode(s_prime, a))
+        action_Qs[i] = session.get_Q(session.encode(s_prime, a))

     return action_Qs.max()

@@ -39,7 +39,7 @@ def __init__(self):
         self.gamma = 0.9

     # Somehow encodes a state and action into a tensor
-    def encode(self, state, action) -> torch.Tensor:
+    def encode(self, state, action, reward, s_prime) -> torch.Tensor:
         pass

     # Set this to a reasonable loss function
@@ -60,20 +60,20 @@ def train_on_past_values(self, batch_size):
         samples = np.random.randint(0,len(self.replay_buffer), size=batch_size)

         for i in samples:
-            self.explore(*self.replay_buffer[i], new=False)
+            self.train(*self.replay_buffer[i], new=False)

     # Updates network based on single step.
-    def explore(self, state, action, reward, s_prime, new = True) -> torch.Tensor :
+    def train(self, state, action, reward, s_prime, new = True) -> torch.Tensor :

         if(new): self.register_in_buffer(state, action, reward, s_prime)

-        input_encoded = session.encode(state, action)
+        input_encoded = session.encode(state, action, reward, s_prime)

         # Forward pass of the network
         pred_Q = self.model(input_encoded)

         # Get Q target of network
-        target_Q = reward + get_maximal_Q(self, s_prime)
+        target_Q = reward + get_maximal_Q(self, s_prime)*self.gamma

         # Calculate loss
         loss = self.loss_function(pred_Q, target_Q)
@@ -83,13 +83,12 @@ def explore(self, state, action, reward, s_prime, new = True) -> torch.Tensor :
         loss.backward()
         self.optimizer.step()

-        self.train_on_past_values()
+        if(new): self.train_on_past_values()

         # Return loss
         return loss

-    # This is used to determine max_a'(Q(s',a'))
-    def forward_no_grad(self, state, action):
+    def get_Q(self, state, action):

         with torch.no_grad():
             pred_Q = self.model(self.encode(state, action))
@@ -122,7 +121,7 @@ def load_model(self, model_filepath):
     # Agent will traverse the state space. As it traverses, it gets states, actions, and rewards from the environment.
     # Run this as many times as you want, exploring different parts of state space
     state, action, reward, s_prime = None, None, None, None
-    session.explore(state, action, reward, s_prime)
+    session.train(state, action, reward, s_prime)

     # Save model trained somewhere
     save_path = ""
@@ -132,4 +131,4 @@ def load_model(self, model_filepath):
     session.load_model(save_path)

     # Keep training, or go ahead and actually use by calling forward_no_grad
-    pred_Q = session.forward_no_grad(state, action)
\ No newline at end of file
+    pred_Q = session.get_Q(state, action)
\ No newline at end of file

From 078f61a5d16442cd863598c9044e3575a9f791a8 Mon Sep 17 00:00:00 2001
From: skinreich11
Date: Sun, 27 Apr 2025 12:26:09 -0400
Subject: [PATCH 3/3] Pushing encoding class, algorithms, and logic
---
 game_encoder/__init__.py     |  0
 game_encoder/game_encoder.py | 97 ++++++++++++++++++++++++++++++++++++
 raise_player.py              | 28 +++++++++--
 randomplayer.py              | 37 ++++++++++----
 4 files changed, 148 insertions(+), 14 deletions(-)
 create mode 100644 game_encoder/__init__.py
 create mode 100644 game_encoder/game_encoder.py

diff --git a/game_encoder/__init__.py b/game_encoder/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/game_encoder/game_encoder.py b/game_encoder/game_encoder.py
new file mode 100644
index 0000000..4c667f0
--- /dev/null
+++ b/game_encoder/game_encoder.py
@@ -0,0 +1,97 @@
+import numpy as np
+
+class PokerGameEncoder:
+    _suit_offset = {'C': 0, 'D': 13, 'H': 26, 'S': 39}
+    _rank_value = {'2':0, '3':1, '4':2, '5':3, '6':4, '7':5, '8':6, '9':7, 'T':8, 'J':9, 'Q':10, 'K':11, 'A':12}
+    _pre_rounds = {'river': 'turn', 'turn': 'flop', 'flop': 'preflop'}
+
+    def __init__(self, our_uuid, our_stack, op_uuid, op_stack, game_info):
+        self.our_uuid = our_uuid
+        self.opponent_uuid = op_uuid
+        self.hole_cards_vector = np.zeros(52, dtype=np.float32)
+        self.community_cards_vector = np.zeros(52, dtype=np.float32)
+        self.our_total_money = our_stack
+        self.opponent_total_money = op_stack
+        self.our_investment = 0
+        self.opponent_investment = 0
+        self.round_count = 1
+        self.seats = game_info.get('seats')
+        self.prev_our_pot = our_stack
+        self.prev_op_pot = op_stack
+
+    @classmethod
+    def card_to_index(cls, card):
+        return cls._suit_offset[card[0]] + cls._rank_value[card[1]]
+
+    @classmethod
+    def encode_cards(cls, cards):
+        vec = np.zeros(52, dtype=np.float32)
+        for card in cards:
+            vec[cls.card_to_index(card)] = 1.0
+        return vec
+
+    def get_stack(self, uuid, new_game_data):
+        for seat in new_game_data.get('seats'):
+            if seat['uuid'] == uuid:
+                return seat['stack']
+        raise ValueError(f"UUID {uuid} not found in seats")
+
+    def update(self, new_game_data, new_hole_cards=None):
+        if new_hole_cards:
+            self.hole_cards_vector = self.encode_cards(new_hole_cards)
+        self.community_cards_vector = self.encode_cards(new_game_data.get('community_card', []))
+
+        if new_game_data['round_count'] != self.round_count:
+            self.round_count = new_game_data['round_count']
+            small_blind_pos = new_game_data['small_blind_pos']
+            small_blind_amount = new_game_data['small_blind_amount']
+
+            small_blind_uuid = new_game_data['seats'][small_blind_pos]['uuid']
+
+            if new_game_data.get('action_histories').get('preflop')[-1].get('action') == 'BIGBLIND':
+                if self.our_uuid == small_blind_uuid:
+                    self.our_investment = small_blind_amount
+                    self.opponent_investment = 2 * small_blind_amount
+                    self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + small_blind_amount
+                    self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + 2 * small_blind_amount
+                    self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
+                    self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
+                else:
+                    self.our_investment = 2 * small_blind_amount
+                    self.opponent_investment = small_blind_amount
+                    self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + 2 * small_blind_amount
+                    self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + small_blind_amount
+                    self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
+                    self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
+            else:
+                last_opponent_paid = new_game_data.get('action_histories').get('preflop')[-1].get('paid')
+                if self.our_uuid == small_blind_uuid:
+                    self.our_investment = small_blind_amount
+                    self.opponent_investment = 2 * small_blind_amount + last_opponent_paid
+                    self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + small_blind_amount
+                    self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + 2 * small_blind_amount + last_opponent_paid
+                    self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
+                    self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
+                else:
+                    self.our_investment = 2 * small_blind_amount
+                    self.opponent_investment = small_blind_amount + last_opponent_paid
+                    self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + 2 * small_blind_amount
+                    self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + small_blind_amount + last_opponent_paid
+                    self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
+                    self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
+        else:
+            self.round_count = new_game_data['round_count']
+            self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
+            self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
+            self.our_investment = self.prev_our_pot - self.our_total_money
+            self.opponent_investment = self.prev_op_pot - self.opponent_total_money
+
+    def get_features(self):
+        return {
+            'hole_cards_vector': self.hole_cards_vector,
+            'community_cards_vector': self.community_cards_vector,
+            'our_total_money': self.our_total_money,
+            'opponent_total_money': self.opponent_total_money,
+            'our_investment_this_round': self.our_investment,
+            'opponent_investment_this_round': self.opponent_investment,
+        }
\ No newline at end of file
diff --git a/raise_player.py b/raise_player.py
index efccb56..7da00e2 100644
--- a/raise_player.py
+++ b/raise_player.py
@@ -1,19 +1,41 @@
 from pypokerengine.players import BasePokerPlayer
 from time import sleep
+from game_encoder.game_encoder import PokerGameEncoder
 import pprint

 class RaisedPlayer(BasePokerPlayer):

+    classencoder = None
+
     def declare_action(self, valid_actions, hole_card, round_state):
+        self.classencoder.update(round_state, hole_card)
+        pp = pprint.PrettyPrinter(indent=2)
+        print("------------ROUND_STATE(RAISE)--------")
+        pp.pprint(round_state)
+        print("------------HOLE_CARD----------")
+        pp.pprint(hole_card)
+        print("------------VALID_ACTIONS----------")
+        pp.pprint(valid_actions)
+        print("------------FEATURES----------")
+        pp.pprint(self.classencoder.get_features())
+        print("-------------------------------")
         for i in valid_actions:
             if i["action"] == "raise":
                 action = i["action"]
-                return action  # action returned here is sent to the poker engine
+                return action
         action = valid_actions[1]["action"]
-        return action # action returned here is sent to the poker engine
+        return action

     def receive_game_start_message(self, game_info):
-        pass
+        seats = game_info.get('seats')
+        for seat in seats:
+            if seat.get('name') == 'RaisedPlayer':
+                self_uuid = seat.get('uuid')
+                self_stack = seat.get('stack')
+            else:
+                op_uuid = seat.get('uuid')
+                op_stack = seat.get('stack')
+        self.classencoder = PokerGameEncoder(self_uuid, self_stack, op_uuid, op_stack, game_info)

     def receive_round_start_message(self, round_count, hole_card, seats):
         pass
diff --git a/randomplayer.py b/randomplayer.py
index 78bce72..de63cc7 100644
--- a/randomplayer.py
+++ b/randomplayer.py
@@ -1,19 +1,24 @@
 from pypokerengine.players import BasePokerPlayer
 import random as rand
+from game_encoder.game_encoder import PokerGameEncoder
 import pprint

 class RandomPlayer(BasePokerPlayer):

+    classencoder = None
+
     def declare_action(self, valid_actions, hole_card, round_state):
-        # valid_actions format => [raise_action_pp = pprint.PrettyPrinter(indent=2)
-        #pp = pprint.PrettyPrinter(indent=2)
-        #print("------------ROUND_STATE(RANDOM)--------")
-        #pp.pprint(round_state)
-        #print("------------HOLE_CARD----------")
-        #pp.pprint(hole_card)
-        #print("------------VALID_ACTIONS----------")
-        #pp.pprint(valid_actions)
-        #print("-------------------------------")
+        self.classencoder.update(round_state, hole_card)
+        pp = pprint.PrettyPrinter(indent=2)
+        print("------------ROUND_STATE(RANDOM)--------")
+        pp.pprint(round_state)
+        print("------------HOLE_CARD----------")
+        pp.pprint(hole_card)
+        print("------------VALID_ACTIONS----------")
+        pp.pprint(valid_actions)
+        print("------------FEATURES----------")
+        pp.pprint(self.classencoder.get_features())
+        print("-------------------------------")
         r = rand.random()
         if r <= 0.5:
             call_action_info = valid_actions[1]
@@ -25,10 +30,20 @@ def declare_action(self, valid_actions, hole_card, round_state):
         return action # action returned here is sent to the poker engine

     def receive_game_start_message(self, game_info):
-        pass
+        seats = game_info.get('seats')
+        for seat in seats:
+            if seat.get('name') == 'Random Warrior 1':
+                self_uuid = seat.get('uuid')
+                self_stack = seat.get('stack')
+            else:
+                op_uuid = seat.get('uuid')
+                op_stack = seat.get('stack')
+        self.classencoder = PokerGameEncoder(self_uuid, self_stack, op_uuid, op_stack, game_info)

     def receive_round_start_message(self, round_count, hole_card, seats):
-        pass
+        pp = pprint.PrettyPrinter(indent=2)
+        print("------------NEW_ROUND_STATE(RANDOM)--------")
+        pp.pprint(seats)

     def receive_street_start_message(self, street, round_state):
         pass
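
Note on wiring the pieces together: the PokerGameEncoder added in PATCH 3/3 produces the per-state features, while the QSession from PATCH 1/3 and 2/3 still leaves its 'encode' method as a stub. Below is a minimal sketch of one way the output of get_features() could be flattened into the tensor that encode would return. The helper name encode_features and the concatenation order are illustrative assumptions, not part of these patches.

    import numpy as np
    import torch

    def encode_features(features: dict) -> torch.Tensor:
        # 'features' is the dict returned by PokerGameEncoder.get_features().
        # Concatenate the two 52-dim card vectors with the four scalar money
        # features into a single 1-D float32 tensor for a torch.nn.Module.
        scalars = np.array([
            features['our_total_money'],
            features['opponent_total_money'],
            features['our_investment_this_round'],
            features['opponent_investment_this_round'],
        ], dtype=np.float32)
        flat = np.concatenate([
            features['hole_cards_vector'],
            features['community_cards_vector'],
            scalars,
        ])
        return torch.from_numpy(flat)  # shape (108,)

Normalizing the money features (for example, dividing by the starting stack) before concatenation would likely help training, but that choice is left open here.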