This repository was archived by the owner on Jul 16, 2024. It is now read-only.

Commit 2bbb7e9

committed: cleaning up some errors
1 parent: cd48a73

2 files changed: +21 −179

research/test_methodology/RT.py

+9 −11

@@ -1,27 +1,25 @@
+from typing import List
+
 import dill as pickle
 
-from RT_cfr import *
-from pluribus.games.short_deck.state import *
-from pluribus.games.short_deck.agent import *
+from RT_cfr import train
+from pluribus.games.short_deck.agent import TrainedAgent
 from pluribus.poker.card import Card
 
 
 if __name__ == "__main__":
-    # public_cards = [Card("ace", "spades"), Card("queen", "spades"), Card("queen", "hearts")]
-    public_cards = []
+    # public_cards = [Card("ace", "spades"), Card("queen", "spades"),
+    #                 Card("queen", "hearts")]
+    public_cards: List[Card] = []
     # we load a (trained) strategy
    agent1 = TrainedAgent("../blueprint_algo/results_2020_05_10_21_36_47_291425")
-    # sorta hacky, but I loaded the average strategy above, now I'm replacing with
     # the better strategy
     # offline_strategy = joblib.load('/Users/colin/Downloads/offline_strategy_285800.gz')
-    # print(sys.getsizeof(offline_strategy))
-    # agent1.offline_strategy = offline_strategy
-    # print(sys.getsizeof(agent1.offline_strategy))
     action_sequence = ["raise", "call", "call", "call", "call"]
     agent_output = train(
         agent1.offline_strategy, public_cards, action_sequence, 40, 6, 6, 3, 2, 6
-    ) # TODO: back to 50
-    with open("realtime-strategy-refactor-deck.pkl", "wb") as file:
+    )
+    with open("testing2.pkl", "wb") as file:
         pickle.dump(agent_output, file)
     import ipdb
     ipdb.set_trace()
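
Side note: after this change, RT.py dumps the trained output with dill (imported as pickle) into "testing2.pkl". A minimal sketch of reading that file back for inspection, assuming the same dill-as-pickle convention used above:

import dill as pickle

# Load the object RT.py dumped above; "testing2.pkl" is the filename this
# commit writes to, and dill stands in for the stdlib pickle module.
with open("testing2.pkl", "rb") as file:
    agent_output = pickle.load(file)

print(type(agent_output))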

research/test_methodology/RT_cfr.py

+12 −168

@@ -1,199 +1,41 @@
-"""
-"""
 from __future__ import annotations
 
 import logging
-
-logging.basicConfig(filename="test.txt", level=logging.DEBUG)
+import sys
 
 from tqdm import trange
+import numpy as np
 
 from pluribus import utils
-from pluribus.games.short_deck.state import *
-from pluribus.games.short_deck.agent import *
+from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
+from pluribus.games.short_deck.agent import Agent
+sys.path.append('../blueprint_algo')
+from blueprint_short_deck_poker import calculate_strategy, cfr, cfrp
 
 
 def update_strategy(agent: Agent, state: ShortDeckPokerState, ph_test_node: int):
     """
-
-    :param state: the game state
-    :param i: the player, i = 1 is always first to act and i = 2 is always second to act, but they take turns who
-        updates the strategy (only one strategy)
-    :return: nothing, updates action count in the strategy of actions chosen according to sigma, this simple choosing of
-        actions is what allows the algorithm to build up preference for one action over another in a given spot
+    Update strategy for test node only
     """
-    logging.debug("UPDATE STRATEGY")
-    logging.debug("########")
-
-    logging.debug(f"P(h): {state.player_i}")
-    logging.debug(f"Betting Round {state._betting_stage}")
-    logging.debug(f"Community Cards {state._table.community_cards}")
-    logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
-    logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
-    logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
-    logging.debug(f"Betting Action Correct?: {state.players}")
-
-    ph = state.player_i # this is always the case no matter what i is
-
+    ph = state.player_i
     if ph == ph_test_node:
-        try:
-            I = state.info_set
-        except:
-            import ipdb
-
-            ipdb.set_trace()
+        I = state.info_set
         # calculate regret
-        logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
-        logging.debug(f"Current regret: {agent.regret[I]}")
         sigma = calculate_strategy(agent.regret, I, state)
-        logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
         # choose an action based of sigma
         try:
             a = np.random.choice(list(sigma[I].keys()), 1, p=list(sigma[I].values()))[0]
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
         except ValueError:
             p = 1 / len(state.legal_actions)
             probabilities = np.full(len(state.legal_actions), p)
             a = np.random.choice(state.legal_actions, p=probabilities)
             sigma[I] = {action: p for action in state.legal_actions}
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
         # Increment the action counter.
         agent.strategy[I][a] += 1
-        logging.debug(f"Updated Strategy for {I}: {agent.strategy[I]}")
         return
     else:
         return
 
-
-def calculate_strategy(
-    regret: Dict[str, Dict[str, float]], I: str, state: ShortDeckPokerState,
-):
-    """
-
-    :param regret: dictionary of regrets, I is key, then each action at I, with values being regret
-    :param sigma: dictionary of strategy updated by regret, iteration is key, then I is key, then each action with prob
-    :param I:
-    :param state: the game state
-    :return: doesn't return anything, just updates sigma
-    """
-    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
-    rsum = sum([max(x, 0) for x in regret[I].values()])
-    for a in state.legal_actions:
-        if rsum > 0:
-            sigma[I][a] = max(regret[I][a], 0) / rsum
-        else:
-            sigma[I][a] = 1 / len(state.legal_actions)
-    return sigma
-
-
-def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
-    """
-    regular cfr algo
-
-    :param state: the game state
-    :param i: player
-    :param t: iteration
-    :return: expected value for node for player i
-    """
-    logging.debug("CFR")
-    logging.debug("########")
-    logging.debug(f"Iteration: {t}")
-    logging.debug(f"Player Set to Update Regret: {i}")
-    logging.debug(f"P(h): {state.player_i}")
-    logging.debug(f"P(h) Updating Regret? {state.player_i == i}")
-    logging.debug(f"Betting Round {state._betting_stage}")
-    logging.debug(f"Community Cards {state._table.community_cards}")
-    logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
-    logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
-    logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
-    logging.debug(f"Betting Action Correct?: {state.players}")
-
-    ph = state.player_i
-
-    player_not_in_hand = not state.players[i].is_active
-    if state.is_terminal or player_not_in_hand:
-        return state.payout[i]
-
-    # NOTE(fedden): The logic in Algorithm 1 in the supplementary material
-    #               instructs the following lines of logic, but state class
-    #               will already skip to the next in-hand player.
-    # elif p_i not in hand:
-    #   cfr()
-    # NOTE(fedden): According to Algorithm 1 in the supplementary material,
-    #               we would add in the following bit of logic. However we
-    #               already have the game logic embedded in the state class,
-    #               and this accounts for the chance samplings. In other words,
-    #               it makes sure that chance actions such as dealing cards
-    #               happen at the appropriate times.
-    # elif h is chance_node:
-    #   sample action from strategy for h
-    #   cfr()
-
-    elif ph == i:
-        try:
-            I = state.info_set
-        except:
-            import ipdb
-
-            ipdb.set_trace()
-        # calculate strategy
-        logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
-        logging.debug(f"Current regret: {agent.regret[I]}")
-        sigma = calculate_strategy(agent.regret, I, state)
-        logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
-
-        vo = 0.0
-        voa = {}
-        for a in state.legal_actions:
-            logging.debug(
-                f"ACTION TRAVERSED FOR REGRET: ph {state.player_i} ACTION: {a}"
-            )
-            new_state: ShortDeckPokerState = state.apply_action(a)
-            voa[a] = cfr(agent, new_state, i, t)
-            logging.debug(f"Got EV for {a}: {voa[a]}")
-            vo += sigma[I][a] * voa[a]
-            logging.debug(
-                f"""Added to Node EV for ACTION: {a} INFOSET: {I}
-                STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}"""
-            )
-        logging.debug(f"Updated EV at {I}: {vo}")
-
-        for a in state.legal_actions:
-            agent.regret[I][a] += voa[a] - vo
-        logging.debug(f"Updated Regret at {I}: {agent.regret[I]}")
-
-        return vo
-    else:
-        # import ipdb;
-        # ipdb.set_trace()
-        try:
-            Iph = state.info_set
-        except:
-            import ipdb
-
-            ipdb.set_trace()
-        logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[Iph]}")
-        logging.debug(f"Current regret: {agent.regret[Iph]}")
-        sigma = calculate_strategy(agent.regret, Iph, state)
-        logging.debug(f"Calculated Strategy for {Iph}: {sigma[Iph]}")
-
-        try:
-            a = np.random.choice(
-                list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()),
-            )[0]
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
-
-        except ValueError:
-            p = 1 / len(state.legal_actions)
-            probabilities = np.full(len(state.legal_actions), p)
-            a = np.random.choice(state.legal_actions, p=probabilities)
-            sigma[Iph] = {action: p for action in state.legal_actions}
-            logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
-
-        new_state: ShortDeckPokerState = state.apply_action(a)
-        return cfr(agent, new_state, i, t)
-
-
 def train(
     offline_strategy: Dict,
     public_cards: list,
@@ -206,10 +48,12 @@ def train(
     update_threshold: int,
 ):
     """Train agent."""
+    # TODO: fix the seed
     utils.random.seed(36)
     agent = Agent()
 
-    state: ShortDeckPokerState = new_game(3, real_time_test=True, public_cards=public_cards)
+    state: ShortDeckPokerState = new_game(3, real_time_test=True,
+                                          public_cards=public_cards)
     current_game_state: ShortDeckPokerState = state.load_game_state(
         offline_strategy,
         action_sequence
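
For context, the helpers this file now imports from ../blueprint_algo/blueprint_short_deck_poker.py replace the copies deleted above. The deleted calculate_strategy is plain regret matching: actions are weighted in proportion to their positive regret, with a uniform fallback when no action has positive regret. The sketch below restates that removed logic in self-contained form (it takes the list of legal actions directly rather than a ShortDeckPokerState, purely to keep the example standalone), assuming the imported version behaves the same way.

import collections
from typing import Dict, List

def calculate_strategy(regret: Dict[str, Dict[str, float]], I: str,
                       legal_actions: List[str]):
    """Regret matching: strategy at infoset I proportional to positive regret."""
    # Nested defaultdicts mirror the removed code; the 1/3 default is never
    # used here because every legal action is assigned explicitly below.
    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
    rsum = sum(max(x, 0) for x in regret[I].values())
    for a in legal_actions:
        if rsum > 0:
            # Weight each action by its clipped (non-negative) regret.
            sigma[I][a] = max(regret[I][a], 0) / rsum
        else:
            # No positive regret anywhere: fall back to a uniform strategy.
            sigma[I][a] = 1 / len(legal_actions)
    return sigma

# Example: two actions at a hypothetical infoset, one with positive regret.
regret = {"some_infoset": {"call": 2.0, "fold": -1.0}}
print(dict(calculate_strategy(regret, "some_infoset", ["call", "fold"])["some_infoset"]))
# -> {'call': 1.0, 'fold': 0.0}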
