- """
- """
from __future__ import annotations

import logging
-
- logging.basicConfig(filename="test.txt", level=logging.DEBUG)
+ import sys

from tqdm import trange
+ import numpy as np

from pluribus import utils
- from pluribus.games.short_deck.state import *
- from pluribus.games.short_deck.agent import *
+ from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
+ from pluribus.games.short_deck.agent import Agent
+ sys.path.append('../blueprint_algo')
+ from blueprint_short_deck_poker import calculate_strategy, cfr, cfrp


def update_strategy(agent: Agent, state: ShortDeckPokerState, ph_test_node: int):
    """
-
-     :param state: the game state
-     :param i: the player, i = 1 is always first to act and i = 2 is always second to act, but they take turns who
-         updates the strategy (only one strategy)
-     :return: nothing, updates action count in the strategy of actions chosen according to sigma, this simple choosing of
-         actions is what allows the algorithm to build up preference for one action over another in a given spot
+     Update strategy for test node only
    """
-     logging.debug("UPDATE STRATEGY")
-     logging.debug("########")
-
-     logging.debug(f"P(h): {state.player_i}")
-     logging.debug(f"Betting Round {state._betting_stage}")
-     logging.debug(f"Community Cards {state._table.community_cards}")
-     logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
-     logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
-     logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
-     logging.debug(f"Betting Action Correct?: {state.players}")
-
-     ph = state.player_i  # this is always the case no matter what i is
-
+     ph = state.player_i
    if ph == ph_test_node:
-         try:
-             I = state.info_set
-         except:
-             import ipdb
-
-             ipdb.set_trace()
+         I = state.info_set
        # calculate regret
-         logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
-         logging.debug(f"Current regret: {agent.regret[I]}")
        sigma = calculate_strategy(agent.regret, I, state)
-         logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
        # choose an action based of sigma
        try:
            a = np.random.choice(list(sigma[I].keys()), 1, p=list(sigma[I].values()))[0]
-             logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
        except ValueError:
            p = 1 / len(state.legal_actions)
            probabilities = np.full(len(state.legal_actions), p)
            a = np.random.choice(state.legal_actions, p=probabilities)
            sigma[I] = {action: p for action in state.legal_actions}
-             logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
        # Increment the action counter.
        agent.strategy[I][a] += 1
-         logging.debug(f"Updated Strategy for {I}: {agent.strategy[I]}")
        return
    else:
        return

-
- def calculate_strategy(
-     regret: Dict[str, Dict[str, float]], I: str, state: ShortDeckPokerState,
- ):
-     """
-
-     :param regret: dictionary of regrets, I is key, then each action at I, with values being regret
-     :param sigma: dictionary of strategy updated by regret, iteration is key, then I is key, then each action with prob
-     :param I:
-     :param state: the game state
-     :return: doesn't return anything, just updates sigma
-     """
-     sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
-     rsum = sum([max(x, 0) for x in regret[I].values()])
-     for a in state.legal_actions:
-         if rsum > 0:
-             sigma[I][a] = max(regret[I][a], 0) / rsum
-         else:
-             sigma[I][a] = 1 / len(state.legal_actions)
-     return sigma
-
-
- def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
-     """
-     regular cfr algo
-
-     :param state: the game state
-     :param i: player
-     :param t: iteration
-     :return: expected value for node for player i
-     """
-     logging.debug("CFR")
-     logging.debug("########")
-     logging.debug(f"Iteration: {t}")
-     logging.debug(f"Player Set to Update Regret: {i}")
-     logging.debug(f"P(h): {state.player_i}")
-     logging.debug(f"P(h) Updating Regret? {state.player_i == i}")
-     logging.debug(f"Betting Round {state._betting_stage}")
-     logging.debug(f"Community Cards {state._table.community_cards}")
-     logging.debug(f"Player 0 hole cards: {state.players[0].cards}")
-     logging.debug(f"Player 1 hole cards: {state.players[1].cards}")
-     logging.debug(f"Player 2 hole cards: {state.players[2].cards}")
-     logging.debug(f"Betting Action Correct?: {state.players}")
-
-     ph = state.player_i
-
-     player_not_in_hand = not state.players[i].is_active
-     if state.is_terminal or player_not_in_hand:
-         return state.payout[i]
-
-     # NOTE(fedden): The logic in Algorithm 1 in the supplementary material
-     #               instructs the following lines of logic, but state class
-     #               will already skip to the next in-hand player.
-     # elif p_i not in hand:
-     #   cfr()
-     # NOTE(fedden): According to Algorithm 1 in the supplementary material,
-     #               we would add in the following bit of logic. However we
-     #               already have the game logic embedded in the state class,
-     #               and this accounts for the chance samplings. In other words,
-     #               it makes sure that chance actions such as dealing cards
-     #               happen at the appropriate times.
-     # elif h is chance_node:
-     #   sample action from strategy for h
-     #   cfr()
-
-     elif ph == i:
-         try:
-             I = state.info_set
-         except:
-             import ipdb
-
-             ipdb.set_trace()
-         # calculate strategy
-         logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[I]}")
-         logging.debug(f"Current regret: {agent.regret[I]}")
-         sigma = calculate_strategy(agent.regret, I, state)
-         logging.debug(f"Calculated Strategy for {I}: {sigma[I]}")
-
-         vo = 0.0
-         voa = {}
-         for a in state.legal_actions:
-             logging.debug(
-                 f"ACTION TRAVERSED FOR REGRET: ph {state.player_i} ACTION: {a}"
-             )
-             new_state: ShortDeckPokerState = state.apply_action(a)
-             voa[a] = cfr(agent, new_state, i, t)
-             logging.debug(f"Got EV for {a}: {voa[a]}")
-             vo += sigma[I][a] * voa[a]
-             logging.debug(
-                 f"""Added to Node EV for ACTION: {a} INFOSET: {I}
-                 STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}"""
-             )
-         logging.debug(f"Updated EV at {I}: {vo}")
-
-         for a in state.legal_actions:
-             agent.regret[I][a] += voa[a] - vo
-         logging.debug(f"Updated Regret at {I}: {agent.regret[I]}")
-
-         return vo
-     else:
-         # import ipdb;
-         # ipdb.set_trace()
-         try:
-             Iph = state.info_set
-         except:
-             import ipdb
-
-             ipdb.set_trace()
-         logging.debug(f"About to Calculate Strategy, Regret: {agent.regret[Iph]}")
-         logging.debug(f"Current regret: {agent.regret[Iph]}")
-         sigma = calculate_strategy(agent.regret, Iph, state)
-         logging.debug(f"Calculated Strategy for {Iph}: {sigma[Iph]}")
-
-         try:
-             a = np.random.choice(
-                 list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()),
-             )[0]
-             logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
-
-         except ValueError:
-             p = 1 / len(state.legal_actions)
-             probabilities = np.full(len(state.legal_actions), p)
-             a = np.random.choice(state.legal_actions, p=probabilities)
-             sigma[Iph] = {action: p for action in state.legal_actions}
-             logging.debug(f"ACTION SAMPLED: ph {state.player_i} ACTION: {a}")
-
-         new_state: ShortDeckPokerState = state.apply_action(a)
-         return cfr(agent, new_state, i, t)
-
-
def train(
    offline_strategy: Dict,
    public_cards: list,
@@ -206,10 +48,12 @@ def train(
    update_threshold: int,
):
    """Train agent."""
+     # TODO: fix the seed
    utils.random.seed(36)
    agent = Agent()

-     state: ShortDeckPokerState = new_game(3, real_time_test=True, public_cards=public_cards)
+     state: ShortDeckPokerState = new_game(3, real_time_test=True,
+                                           public_cards=public_cards)
    current_game_state: ShortDeckPokerState = state.load_game_state(
        offline_strategy,
        action_sequence