# D4PG.py
import numpy as np
import gym
from gym import spaces
import random
import gym
import numpy as np
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
class MultiDimensionalStochasticProgrammingEnv(gym.Env):
    """Multi-product inventory replenishment environment with random demand.

    Every step the agent orders a quantity for each product. The order is
    added to the stock (bounded to ``[0, max_inventory]``), a random demand is
    drawn per product and served from the stock, and the reward is the
    revenue earned on the served demand. An episode lasts ``max_days`` steps.

    NOTE(review): ``penalty`` and ``store_cost_per_unit`` are defined but are
    not part of the reward. Whether an order exceeded the ``bench`` budget is
    only reported through the ``info`` dict returned by :meth:`step`.
    """

    def __init__(self, num_products=3):
        """Create the environment.

        Args:
            num_products: number of products tracked; this is the length of
                both the observation and the action vectors.
        """
        super().__init__()
        self.num_products = num_products
        # Action space: replenishment quantity per product, in [0, 300].
        self.action_space = spaces.Box(low=0, high=300, shape=(num_products,), dtype=np.float32)
        self.penalty = 1e8
        # Observation space: current stock level of every product.
        self.observation_space = spaces.Box(low=0.0, high=300.0, shape=(num_products,), dtype=np.float32)
        self.cost = np.full(num_products, 10)  # unit purchase cost, resampled in step()
        self.max_inventory = 300  # per-product stock cap
        # Float stock from the start so reset() and step() return the same dtype.
        self.current_inventory = np.zeros(num_products, dtype=np.float64)
        self.total_reward = 0
        self.profit_per_unit = 50  # revenue per unit of served demand
        self.store_cost_per_unit = 5  # storage cost per unit (currently unused)
        self.day = 0  # step counter within the episode
        self.max_days = 100  # episode length
        self.bench = 150 * self.num_products * 10  # purchase-cost budget per step

    def reset(self):
        """Empty the stock, clear the counters and return the initial state."""
        self.current_inventory = np.zeros(self.num_products, dtype=np.float64)
        self.total_reward = 0
        self.day = 0
        return self.current_inventory

    def step(self, action):
        """Apply one replenishment order and serve one day of demand.

        Args:
            action: per-product order quantities (any array-like of length
                ``num_products``).

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the stock left
            after serving demand, ``reward`` is the revenue on served demand,
            ``done`` is True once ``max_days`` steps have elapsed, and
            ``info`` holds ``purchase_cost``, ``over_budget`` and
            ``unmet_demand`` for diagnostics.
        """
        action = np.asarray(action, dtype=np.float64)
        # Bound the stock on both sides. The previous np.minimum(...) call
        # only applied the upper cap (its third argument is numpy's ``out``
        # buffer, not a lower bound), so a negative order could drive the
        # stock below the declared observation range.
        self.current_inventory = np.clip(
            self.current_inventory + action, 0.0, self.max_inventory
        )
        # Keep the draw order (cost first, then demand) so seeded runs
        # consume the random stream the same way as before.
        self.cost = np.random.normal(loc=10, scale=1, size=self.num_products)
        self.day += 1
        demand = np.random.normal(loc=150, scale=1, size=self.num_products)
        whole_cost = float(np.dot(action, self.cost))

        # Demand that can actually be served from the available stock.
        satisfied_demand = np.minimum(self.current_inventory, demand)
        unmet_demand = np.maximum(demand - satisfied_demand, 0)
        self.current_inventory = self.current_inventory - satisfied_demand

        # Reward is revenue only; purchase and storage costs are not deducted.
        reward = np.sum(satisfied_demand * self.profit_per_unit)
        self.total_reward += reward

        done = self.day >= self.max_days
        info = {
            "purchase_cost": whole_cost,
            "over_budget": bool(whole_cost > self.bench),
            "unmet_demand": unmet_demand,
        }
        return self.current_inventory, reward, done, info

    def render(self, mode='human'):
        """Print the current stock, cumulative reward and day counter."""
        print(f"Current Inventory: {self.current_inventory}, Total Reward: {self.total_reward}, Day: {self.day}")
import torch
import torch.nn.functional as F
import torch.multiprocessing as mp
import numpy as np
import random
from collections import deque
class ValueNet(torch.nn.Module):
    """State-value critic: one ReLU hidden layer followed by a scalar head.

    The hidden layer is twice as wide as ``hidden_dim``.
    """

    def __init__(self, state_dim, hidden_dim):
        """Build the two linear layers.

        Args:
            state_dim: size of the state vector.
            hidden_dim: base hidden width; the layer uses ``2 * hidden_dim``.
        """
        super().__init__()
        width = hidden_dim * 2
        self.fc1 = torch.nn.Linear(state_dim, width)
        self.fc2 = torch.nn.Linear(width, 1)

    def forward(self, x):
        """Return the value estimate, with a trailing dimension of size 1."""
        hidden = torch.relu(self.fc1(x))
        value = self.fc2(hidden)
        return value
class PolicyNetContinuous(torch.nn.Module):
    """Gaussian policy head producing a per-action mean and standard deviation.

    The mean is passed through a ReLU, so it is never negative; the standard
    deviation is a softplus plus a small epsilon, so it is strictly positive.
    """

    def __init__(self, state_dim, hidden_dim, action_dim):
        """Build the shared hidden layer and the two output heads.

        Args:
            state_dim: size of the state vector.
            hidden_dim: width of the hidden layer.
            action_dim: number of action components.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc_mu = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_std = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return ``(mu, std)`` for the given state(s)."""
        x = F.relu(self.fc1(x))
        # A single ReLU is enough: ReLU is idempotent, so the previous nested
        # relu(relu(.)) produced exactly the same values.
        mu = F.relu(self.fc_mu(x))
        std = F.softplus(self.fc_std(x)) + 1e-6  # keep std strictly positive
        return mu, std
class GlobalNet(torch.nn.Module):
    """Actor-critic container pairing a Gaussian policy with a value network."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        """Create the actor first, then the critic.

        Args:
            state_dim: size of the state vector fed to both sub-networks.
            hidden_dim: hidden width handed to both sub-networks.
            action_dim: number of action components produced by the actor.
        """
        super().__init__()
        self.actor = PolicyNetContinuous(state_dim, hidden_dim, action_dim)
        self.critic = ValueNet(state_dim, hidden_dim)

    def forward(self, x):
        """Return ``(mu, std, value)`` for the given state(s)."""
        policy_mean, policy_std = self.actor(x)
        state_value = self.critic(x)
        return policy_mean, policy_std, state_value
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, buffer_size=100000, batch_size=200):
        """Create an empty buffer.

        Args:
            buffer_size: maximum number of stored transitions; the oldest
                entries are dropped once the buffer is full.
            batch_size: number of transitions returned by :meth:`sample`.
        """
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, experience):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self.buffer.append(experience)

    def sample(self):
        """Draw ``batch_size`` transitions without replacement.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, dones)`` of
            float32 tensors, each with the batch as its first dimension.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Stack each column into one ndarray before handing it to torch:
        # building a tensor straight from a tuple of ndarrays goes through a
        # slow element-by-element path.
        return tuple(
            torch.from_numpy(np.array(column, dtype=np.float32))
            for column in (states, actions, rewards, next_states, dones)
        )

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.buffer)
class Worker(mp.Process):
    """Training process that collects experience and updates the shared network.

    Each worker owns an environment, a replay buffer and a local copy of the
    actor-critic network. Gradients are computed on the local network and
    then attached to the matching parameters of ``global_net`` before the
    shared optimizer takes a step.

    NOTE(review): nothing in this class copies weights from ``global_net``
    back into ``local_net``, so the network used for acting and for computing
    gradients keeps its initial weights. The actor loss is a KL term between
    the local and the global policy, which is identically zero (with zero
    gradient) when the two networks are equal, so adding a weight sync needs
    a decision about the actor objective as well.
    """

    def __init__(self, global_net, optimizer, gamma, lmbda, max_steps, device, worker_id, replay_buffer):
        """Store the shared objects and build the local environment/network.

        Args:
            global_net: shared ``GlobalNet`` whose parameters are optimized.
            optimizer: optimizer constructed over ``global_net`` parameters.
            gamma: discount factor used in the TD target.
            lmbda: stored but not used by the current update rule.
            max_steps: number of episodes this worker runs.
            device: torch device for the local network and the batches.
            worker_id: identifier printed in the progress log.
            replay_buffer: buffer exposing ``add``, ``sample``, ``__len__``
                and ``batch_size``.
        """
        super().__init__()
        self.global_net = global_net
        self.optimizer = optimizer
        self.gamma = gamma
        self.lmbda = lmbda
        self.max_steps = max_steps
        self.device = device
        self.worker_id = worker_id
        self.replay_buffer = replay_buffer
        self.env = MultiDimensionalStochasticProgrammingEnv(num_products=10)
        # Take the hidden width from the global network instead of a
        # module-level ``hidden_dim`` that only exists when the file is run
        # as a script.
        hidden_dim = global_net.actor.fc1.out_features
        self.local_net = GlobalNet(
            self.env.observation_space.shape[0],
            hidden_dim,
            self.env.action_space.shape[0],
        ).to(device)

    def run(self):
        """Run ``max_steps`` episodes, optimizing once the buffer holds a batch."""
        for episode in range(self.max_steps):
            state = self.env.reset()
            done = False
            whole_reward = 0
            while not done:
                state_tensor = torch.as_tensor(state, dtype=torch.float32, device=self.device)
                # Acting needs no autograd graph.
                with torch.no_grad():
                    mu, std = self.local_net.actor(state_tensor)
                    action = torch.normal(mu, std).cpu().numpy()
                next_state, reward, done, _ = self.env.step(action.tolist())
                self.replay_buffer.add((state, action, reward, next_state, done))
                state = next_state
                whole_reward += reward
                # Log every 100th episode once it has finished.
                if done and episode % 100 == 0:
                    print(f"{episode}, Worker {self.worker_id}: Episode reward: {whole_reward}")
                # Optimize after collecting enough experiences.
                if len(self.replay_buffer) >= self.replay_buffer.batch_size:
                    self.optimize()

    def optimize(self):
        """Sample a batch and apply one critic step and one actor step."""
        states, _actions, rewards, next_states, dones = self.replay_buffer.sample()
        states = states.to(self.device)
        next_states = next_states.to(self.device)
        rewards = rewards.unsqueeze(1).to(self.device)
        dones = dones.unsqueeze(1).to(self.device)

        # ---- Critic: one-step TD target, L1 regression ----
        value = self.local_net.critic(states)
        with torch.no_grad():
            next_value = self.local_net.critic(next_states)
            # Do not bootstrap from the state that follows a terminal step.
            td_target = rewards + self.gamma * next_value * (1.0 - dones)
        loss_critic = F.l1_loss(value, td_target)
        self._step_global(
            loss_critic,
            self.global_net.critic.parameters(),
            self.local_net.critic.parameters(),
        )

        # ---- Actor: advantage-weighted KL between local and global policy ----
        mu, std = self.local_net.actor(states)  # current (local) policy
        with torch.no_grad():
            # The reference policy is a constant in this loss; evaluating it
            # without autograd avoids back-propagating into the shared
            # network.
            old_mu, old_std = self.global_net.actor(states)
        kl_divergence = (
            old_std.log() - std.log()  # log(sigma_old / sigma_new)
            + (std.pow(2) + (mu - old_mu).pow(2)) / (2.0 * old_std.pow(2))
            - 0.5
        ).sum(dim=-1)  # sum over action dimensions
        advantage = (td_target - value.detach()).squeeze(-1)  # TD target minus state value
        loss_actor = (kl_divergence * advantage).mean()
        self._step_global(
            loss_actor,
            self.global_net.actor.parameters(),
            self.local_net.actor.parameters(),
        )

    def _step_global(self, loss, global_params, local_params):
        """Back-propagate ``loss`` locally and step the optimizer on the global parameters."""
        # The optimizer only knows the global parameters, so the local
        # gradients must be cleared explicitly or they accumulate across
        # updates.
        self.local_net.zero_grad(set_to_none=True)
        self.optimizer.zero_grad(set_to_none=True)
        loss.backward()
        for global_param, local_param in zip(global_params, local_params):
            global_param.grad = local_param.grad
        self.optimizer.step()
def train_a3c(state_dim, action_dim, hidden_dim, actor_lr, critic_lr, gamma, lmbda, max_steps, num_workers):
    """Train a shared actor-critic network with several worker processes.

    Args:
        state_dim: size of the state vector.
        action_dim: number of action components.
        hidden_dim: hidden width of the networks.
        actor_lr: learning rate applied to the actor parameters.
        critic_lr: learning rate applied to the critic parameters.
        gamma: discount factor forwarded to every worker.
        lmbda: forwarded to every worker.
        max_steps: number of episodes each worker runs.
        num_workers: number of worker processes to start.

    The call blocks until every worker has finished.
    """
    device = torch.device("cpu")
    global_net = GlobalNet(state_dim, hidden_dim, action_dim).to(device)
    global_net.share_memory()  # share parameters across processes

    # One parameter group per sub-network so that both learning rates passed
    # by the caller take effect (previously ``critic_lr`` was ignored).
    optimizer = torch.optim.Adam(
        [
            {"params": global_net.actor.parameters(), "lr": actor_lr},
            {"params": global_net.critic.parameters(), "lr": critic_lr},
        ]
    )

    # Every worker gets its own replay buffer and runs for the caller's
    # ``max_steps`` (previously a hard-coded 5000 was used instead).
    workers = [
        Worker(global_net, optimizer, gamma, lmbda, max_steps, device, worker_id, ReplayBuffer())
        for worker_id in range(num_workers)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
if __name__ == "__main__":
    # Build a throwaway environment only to read the problem dimensions.
    env = MultiDimensionalStochasticProgrammingEnv(num_products=10)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    hidden_dim = 128  # hidden layer width (kept as a module-level name)
    actor_lr = 1e-4  # actor learning rate
    critic_lr = 1e-3  # critic learning rate
    gamma = 0.99  # discount factor
    lmbda = 0.95  # GAE parameter
    # Worker.run loops over episodes, so this is an episode count per worker
    # rather than a count of environment steps.
    max_steps = 5000
    num_workers = 4  # number of worker processes

    train_a3c(state_dim, action_dim, hidden_dim, actor_lr, critic_lr, gamma, lmbda, max_steps, num_workers)