diff --git a/chapter_reinforcement-learning/qlearning.md b/chapter_reinforcement-learning/qlearning.md
index e9c4b0d4c1..c0cbe87327 100644
--- a/chapter_reinforcement-learning/qlearning.md
+++ b/chapter_reinforcement-learning/qlearning.md
@@ -1,3 +1,4 @@
+
 ```{.python .input}
 %load_ext d2lbook.tab
 tab.interact_select(["pytorch"])
@@ -132,9 +133,12 @@ def q_learning(env_info, gamma, num_iters, alpha, epsilon):
         action = e_greedy(env, Q, state, epsilon)
         next_state, reward, done, _ = env.step(action)
 
-        # Q-update:
-        y = reward + gamma * np.max(Q[next_state,:])
-        Q[state, action] = Q[state, action] + alpha * (y - Q[state, action])
+        # Q-learning update: Q(s,a) ← Q(s,a) + α [r + γ max_a' Q(s',a') − Q(s,a)]
+        # Compute the TD target and TD error, then nudge Q(s,a) toward the target
+        td_target = reward + gamma * np.max(Q[next_state, :])
+        td_error = td_target - Q[state, action]
+        Q[state, action] += alpha * td_error
+
         # Move to the next state
         state = next_state
 
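For reference, a minimal self-contained sketch of the update the added lines perform, outside the context of the patched `q_learning` function. The table sizes, hyperparameters, and sampled transition below are illustrative placeholders, not values from the patch:

```python
import numpy as np

# Toy Q-table and hyperparameters (illustrative, not from the patch)
num_states, num_actions = 4, 2
Q = np.zeros((num_states, num_actions))
gamma, alpha = 0.95, 0.1

# One sampled transition (s, a, r, s'), also a placeholder
state, action, reward, next_state = 0, 1, 1.0, 2

# TD target: r + gamma * max_a' Q(s', a')
td_target = reward + gamma * np.max(Q[next_state, :])
# TD error: gap between the target and the current estimate
td_error = td_target - Q[state, action]
# Move Q(s, a) a step of size alpha toward the target
Q[state, action] += alpha * td_error

print(Q[state, action])  # 0.1: one step from 0 toward the target of 1.0
```

Splitting the update into an explicit `td_target` and `td_error`, as the patch does, computes the same value as the replaced one-liner; it just names the intermediate quantities that the update rule in the comment refers to.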