Encoder branch #7

Open — wants to merge 3 commits into base: main
134 changes: 134 additions & 0 deletions deep_q/deep_q_session.py
@@ -0,0 +1,134 @@
from __future__ import annotations
import numpy as np
import torch

# Utility function. Lives outside QSession because it needs to know which
# actions can be taken from s_prime (environment-specific knowledge).
def get_maximal_Q(session: QSession, s_prime):

    # TODO: enumerate the actions that can be taken from s_prime
    actions = []

    # Guard against the placeholder empty action list
    if not actions:
        return 0.0

    action_Qs = np.zeros(len(actions))

    for i, a in enumerate(actions):
        action_Qs[i] = session.get_Q(s_prime, a)

    return action_Qs.max()

# Placeholder network: an identity mapping until a real architecture is defined
class QModel(torch.nn.Module):

def __init__(self):
super().__init__()

def forward(self, data):
return data

class QSession:

def __init__(self):

# Initialized in either new_model or load_model
self.model: torch.nn.Module = None
self.optimizer = None

# This will likely just hold a list of tensors
self.replay_buffer = []

# The weight of future rewards. Set this to something reasonable
self.gamma = 0.9

    # Encodes a state-action pair into a tensor the network can consume
    def encode(self, state, action) -> torch.Tensor:
        pass

    # Set this to a reasonable loss function (mean squared error for now)
    def loss_function(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return (x - y).square().mean()

    # Stores the transition in replay_buffer
    def register_in_buffer(self, state, action, reward, s_prime):
        self.replay_buffer.append(
            [state, action, reward, s_prime]
        )

    # Replays transitions seen in the past to regularize training
    def train_on_past_values(self, batch_size=32):  # default batch size is an arbitrary placeholder

        if not self.replay_buffer:
            return

        # Sample random transitions from replay_buffer
        samples = np.random.randint(0, len(self.replay_buffer), size=batch_size)

        for i in samples:
            self.train(*self.replay_buffer[i], new=False)

    # Updates the network based on a single transition (s, a, r, s').
    def train(self, state, action, reward, s_prime, new=True) -> torch.Tensor:

        if new:
            self.register_in_buffer(state, action, reward, s_prime)

        input_encoded = self.encode(state, action)

        # Forward pass of the network
        pred_Q = self.model(input_encoded)

        # Bellman target: r + gamma * max_a' Q(s', a')
        target_Q = reward + get_maximal_Q(self, s_prime) * self.gamma

        # Calculate loss
        loss = self.loss_function(pred_Q, target_Q)

        # Update weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Replay past transitions, but only on fresh (non-replayed) steps
        if new:
            self.train_on_past_values()

        # Return loss
        return loss

def get_Q(self, state, action):

with torch.no_grad():
pred_Q = self.model(self.encode(state, action))

return pred_Q

def new_model(self):
self.model = QModel()
self.optimizer = torch.optim.Adam(
filter(lambda x: x.requires_grad, self.model.parameters()),
lr=0.001,
)

def save_model(self, save_path):
torch.save(self.model, save_path)

def load_model(self, model_filepath):
self.model = torch.load(model_filepath)
self.optimizer = torch.optim.Adam(
filter(lambda x: x.requires_grad, self.model.parameters()),
lr=0.001,
)

if __name__ == "__main__":

# Example instance
session = QSession()
session.new_model()

    # The agent will traverse the state space. As it traverses, it receives states, actions, and rewards from the environment.
    # Run this as many times as you want, exploring different parts of the state space.
state, action, reward, s_prime = None, None, None, None
session.train(state, action, reward, s_prime)

# Save model trained somewhere
save_path = ""
session.save_model(save_path)

# Load later
session.load_model(save_path)

    # Keep training, or go ahead and actually use the model by calling get_Q
pred_Q = session.get_Q(state, action)
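
The `encode` stub is the main gap between this session skeleton and the new `PokerGameEncoder`. Below is a minimal sketch of one way it could be filled in, assuming the state is the feature dict returned by `PokerGameEncoder.get_features()` and the action is one-hot encoded over a hypothetical fold/call/raise triple; the helper name `encode_state_action` and the `ACTIONS` list are illustrative only, not defined anywhere in this PR.

```python
import numpy as np
import torch

# Hypothetical action ordering; the PR does not define one.
ACTIONS = ["fold", "call", "raise"]

def encode_state_action(features: dict, action: str) -> torch.Tensor:
    """Concatenate PokerGameEncoder features with a one-hot action vector."""
    action_vec = np.zeros(len(ACTIONS), dtype=np.float32)
    action_vec[ACTIONS.index(action)] = 1.0
    parts = [
        features["hole_cards_vector"],
        features["community_cards_vector"],
        np.array([
            features["our_total_money"],
            features["opponent_total_money"],
            features["our_investment_this_round"],
            features["opponent_investment_this_round"],
        ], dtype=np.float32),
        action_vec,
    ]
    return torch.from_numpy(np.concatenate(parts))  # shape: (52 + 52 + 4 + 3,)
```

The resulting 111-dimensional vector (52 + 52 + 4 + 3) would be the input that `QModel.forward` eventually consumes.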
Empty file added game_encoder/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions game_encoder/game_encoder.py
@@ -0,0 +1,97 @@
import numpy as np

class PokerGameEncoder:
_suit_offset = {'C': 0, 'D': 13, 'H': 26, 'S': 39}
_rank_value = {'2':0, '3':1, '4':2, '5':3, '6':4, '7':5, '8':6, '9':7, 'T':8, 'J':9, 'Q':10, 'K':11, 'A':12}
_pre_rounds = {'river': 'turn', 'turn': 'flop', 'flop': 'preflop'}

def __init__(self, our_uuid, our_stack, op_uuid, op_stack, game_info):
self.our_uuid = our_uuid
self.opponent_uuid = op_uuid
self.hole_cards_vector = np.zeros(52, dtype=np.float32)
self.community_cards_vector = np.zeros(52, dtype=np.float32)
self.our_total_money = our_stack
self.opponent_total_money = op_stack
self.our_investment = 0
self.opponent_investment = 0
self.round_count = 1
self.seats = game_info.get('seats')
self.prev_our_pot = our_stack
self.prev_op_pot = op_stack

@classmethod
def card_to_index(cls, card):
return cls._suit_offset[card[0]] + cls._rank_value[card[1]]

@classmethod
def encode_cards(cls, cards):
vec = np.zeros(52, dtype=np.float32)
for card in cards:
vec[cls.card_to_index(card)] = 1.0
return vec

def get_stack(self, uuid, new_game_data):
for seat in new_game_data.get('seats'):
if seat['uuid'] == uuid:
return seat['stack']
raise ValueError(f"UUID {uuid} not found in seats")

def update(self, new_game_data, new_hole_cards=None):
if new_hole_cards:
self.hole_cards_vector = self.encode_cards(new_hole_cards)
self.community_cards_vector = self.encode_cards(new_game_data.get('community_card', []))

if new_game_data['round_count'] != self.round_count:
self.round_count = new_game_data['round_count']
small_blind_pos = new_game_data['small_blind_pos']
small_blind_amount = new_game_data['small_blind_amount']

small_blind_uuid = new_game_data['seats'][small_blind_pos]['uuid']

if new_game_data.get('action_histories').get('preflop')[-1].get('action') == 'BIGBLIND':
if self.our_uuid == small_blind_uuid:
self.our_investment = small_blind_amount
self.opponent_investment = 2 * small_blind_amount
self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + small_blind_amount
self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + 2 * small_blind_amount
self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
else:
self.our_investment = 2 * small_blind_amount
self.opponent_investment = small_blind_amount
self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + 2 * small_blind_amount
self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + small_blind_amount
self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
else:
last_opponent_paid = new_game_data.get('action_histories').get('preflop')[-1].get('paid')
if self.our_uuid == small_blind_uuid:
self.our_investment = small_blind_amount
self.opponent_investment = 2 * small_blind_amount + last_opponent_paid
self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + small_blind_amount
self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + 2 * small_blind_amount + last_opponent_paid
self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
else:
self.our_investment = 2 * small_blind_amount
self.opponent_investment = small_blind_amount + last_opponent_paid
self.prev_our_pot = self.get_stack(self.our_uuid, new_game_data) + 2 * small_blind_amount
self.prev_op_pot = self.get_stack(self.opponent_uuid, new_game_data) + small_blind_amount + last_opponent_paid
self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
else:
self.round_count = new_game_data['round_count']
self.our_total_money = self.get_stack(self.our_uuid, new_game_data)
self.opponent_total_money = self.get_stack(self.opponent_uuid, new_game_data)
self.our_investment = self.prev_our_pot - self.our_total_money
self.opponent_investment = self.prev_op_pot - self.opponent_total_money

def get_features(self):
return {
'hole_cards_vector': self.hole_cards_vector,
'community_cards_vector': self.community_cards_vector,
'our_total_money': self.our_total_money,
'opponent_total_money': self.opponent_total_money,
'our_investment_this_round': self.our_investment,
'opponent_investment_this_round': self.opponent_investment,
}
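
For reference, a small sanity check of the card encoding. Cards follow pypokerengine's suit-then-rank string format (e.g. 'SA' for the ace of spades); the concrete cards below are made up for illustration.

```python
from game_encoder.game_encoder import PokerGameEncoder

# 'SA' -> suit 'S' (offset 39) + rank 'A' (value 12) = index 51
assert PokerGameEncoder.card_to_index('SA') == 51
assert PokerGameEncoder.card_to_index('HK') == 26 + 11  # index 37

# 52-dim one-hot vector with exactly the two hole cards set
hole_vec = PokerGameEncoder.encode_cards(['SA', 'HK'])
assert hole_vec.shape == (52,) and hole_vec.sum() == 2.0
```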
28 changes: 25 additions & 3 deletions raise_player.py
@@ -1,19 +1,41 @@
from pypokerengine.players import BasePokerPlayer
from time import sleep
from game_encoder.game_encoder import PokerGameEncoder
import pprint

class RaisedPlayer(BasePokerPlayer):

classencoder = None

def declare_action(self, valid_actions, hole_card, round_state):
self.classencoder.update(round_state, hole_card)
pp = pprint.PrettyPrinter(indent=2)
print("------------ROUND_STATE(RAISE)--------")
pp.pprint(round_state)
print("------------HOLE_CARD----------")
pp.pprint(hole_card)
print("------------VALID_ACTIONS----------")
pp.pprint(valid_actions)
print("------------FEATURES----------")
pp.pprint(self.classencoder.get_features())
print("-------------------------------")
for i in valid_actions:
if i["action"] == "raise":
action = i["action"]
return action # action returned here is sent to the poker engine
return action
action = valid_actions[1]["action"]
return action # action returned here is sent to the poker engine
return action

def receive_game_start_message(self, game_info):
pass
seats = game_info.get('seats')
for seat in seats:
if seat.get('name') == 'RaisedPlayer':
self_uuid = seat.get('uuid')
self_stack = seat.get('stack')
else:
op_uuid = seat.get('uuid')
op_stack = seat.get('stack')
self.classencoder = PokerGameEncoder(self_uuid, self_stack, op_uuid, op_stack, game_info)

def receive_round_start_message(self, round_count, hole_card, seats):
pass
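Both players now build a `PokerGameEncoder`, but nothing yet feeds its features into the deep_q side. A hedged sketch of how `declare_action` could eventually consult a `QSession` is below; the `self.q_session` attribute, a scalar-output `QModel`, and an `encode(features, action)` that accepts this pair are all assumptions, not something this PR implements.

```python
# Hypothetical integration sketch, not part of this PR.
def declare_action(self, valid_actions, hole_card, round_state):
    self.classencoder.update(round_state, hole_card)
    features = self.classencoder.get_features()

    # Score each currently valid action with the Q-network and pick the best.
    best_action, best_q = None, float("-inf")
    for entry in valid_actions:
        q = float(self.q_session.get_Q(features, entry["action"]))  # assumes a scalar Q output
        if q > best_q:
            best_action, best_q = entry["action"], q
    return best_action
```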
37 changes: 26 additions & 11 deletions randomplayer.py
@@ -1,19 +1,24 @@
from pypokerengine.players import BasePokerPlayer
import random as rand
from game_encoder.game_encoder import PokerGameEncoder
import pprint

class RandomPlayer(BasePokerPlayer):

classencoder = None

def declare_action(self, valid_actions, hole_card, round_state):
# valid_actions format => [raise_action_info, call_action_info, fold_action_info]
#pp = pprint.PrettyPrinter(indent=2)
#print("------------ROUND_STATE(RANDOM)--------")
#pp.pprint(round_state)
#print("------------HOLE_CARD----------")
#pp.pprint(hole_card)
#print("------------VALID_ACTIONS----------")
#pp.pprint(valid_actions)
#print("-------------------------------")
self.classencoder.update(round_state, hole_card)
pp = pprint.PrettyPrinter(indent=2)
print("------------ROUND_STATE(RANDOM)--------")
pp.pprint(round_state)
print("------------HOLE_CARD----------")
pp.pprint(hole_card)
print("------------VALID_ACTIONS----------")
pp.pprint(valid_actions)
print("------------FEATURES----------")
pp.pprint(self.classencoder.get_features())
print("-------------------------------")
r = rand.random()
if r <= 0.5:
call_action_info = valid_actions[1]
@@ -25,10 +30,20 @@ def declare_action(self, valid_actions, hole_card, round_state):
return action # action returned here is sent to the poker engine

def receive_game_start_message(self, game_info):
pass
seats = game_info.get('seats')
for seat in seats:
if seat.get('name') == 'Random Warrior 1':
self_uuid = seat.get('uuid')
self_stack = seat.get('stack')
else:
op_uuid = seat.get('uuid')
op_stack = seat.get('stack')
self.classencoder = PokerGameEncoder(self_uuid, self_stack, op_uuid, op_stack, game_info)

def receive_round_start_message(self, round_count, hole_card, seats):
pass
pp = pprint.PrettyPrinter(indent=2)
print("------------NEW_ROUND_STATE(RANDOM)--------")
pp.pprint(seats)

def receive_street_start_message(self, street, round_state):
pass