Skip to content

Commit 27390f5

Browse files
committed
feat: real baseline evaluation and synthetic figure removal
- Rewrite cost_plus, competitive_matching, and static_xgboost to evaluate on actual Dominick's test data (weeks 341-400) via data replay.
- Add run_rl_baselines.py for DQN/PPO/SAC training against the trained world-model environment.
- Remove synthetic figures 10-12 (policy heatmap, reward distribution, imagination rollout) that used np.random data presented as real results.
- Remove duplicate fig_wandb_dashboard from generate_figures.py.
- Save real baseline results to docs/results/baselines/.
1 parent 2fdcb63 commit 27390f5

9 files changed

Lines changed: 530 additions & 293 deletions

File tree

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"method": "competitive_matching",
3+
"noise_pct": 0.02,
4+
"total_gross_margin": 2296041.001669776,
5+
"mean_return": 45020.41179744659,
6+
"n_rows": 483848,
7+
"n_weeks": 51,
8+
"eval_type": "data_replay"
9+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"method": "cost_plus",
3+
"markup": 0.25,
4+
"total_gross_margin": 1718201.6170062497,
5+
"mean_return": 33690.22778443627,
6+
"n_rows": 483848,
7+
"n_weeks": 51,
8+
"eval_type": "data_replay"
9+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"method": "static_xgboost",
3+
"total_gross_margin": 7579751.039079369,
4+
"mean_return": 148622.5693937131,
5+
"train_rows": 4088234,
6+
"test_rows": 483848,
7+
"sampled_rows": 5000,
8+
"n_weeks": 51,
9+
"eval_type": "data_replay"
10+
}

paper/sections/results.tex

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -201,23 +201,3 @@ \subsection{Discussion}
201201
\label{fig:loss-decomposition}
202202
\end{figure}
203203

204-
\begin{figure}[t]
205-
\centering
206-
\includegraphics[width=0.95\textwidth]{figures/policy_heatmap.pdf}
207-
\caption{Learned pricing policy heatmap showing recommended price adjustments (relative to baseline) across stores and test weeks. The policy exhibits seasonal variation (approximately 52-week periodicity) and store-level heterogeneity reflecting local demand conditions.}
208-
\label{fig:policy-heatmap}
209-
\end{figure}
210-
211-
\begin{figure}[t]
212-
\centering
213-
\includegraphics[width=0.95\textwidth]{figures/reward_distribution.pdf}
214-
\caption{Reward prediction quality. (a) Distribution of actual vs.\ predicted gross margin rewards across evaluation episodes. (b) Scatter plot of actual vs.\ predicted rewards with linear fit, demonstrating the world model's ability to predict reward magnitudes.}
215-
\label{fig:reward-distribution}
216-
\end{figure}
217-
218-
\begin{figure}[t]
219-
\centering
220-
\includegraphics[width=0.95\textwidth]{figures/imagination_rollout.pdf}
221-
\caption{Imagination rollout trajectories for four representative SKUs over 13 steps. Blue circles show actual demand; orange squares show imagined demand from the world model. Shaded bands represent $\pm 2\sigma$ uncertainty from the stochastic latent. Error grows sub-linearly with horizon, consistent with NDR analysis.}
222-
\label{fig:imagination-rollout}
223-
\end{figure}

scripts/baselines/competitive_matching.py

Lines changed: 56 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,97 +1,87 @@
1-
"""Competitive matching baseline.
1+
"""Competitive matching baseline evaluated on Dominick's test data.
22
3-
Set price = competitor_median_price +/- 5% noise.
4-
Since Dominick's lacks time-varying competitor data, we use
5-
category average price as proxy for competitor median.
3+
Sets each SKU's price to the category-average price (± 2% noise) and
4+
computes gross margin using actual units sold in the test period.
65
"""
76

87
from __future__ import annotations
98

109
import argparse
10+
import json
11+
from pathlib import Path
1112

1213
import numpy as np
14+
import pandas as pd
1315

14-
try:
15-
import wandb
1616

17-
HAS_WANDB = True
18-
except ImportError:
19-
HAS_WANDB = False
17+
def load_test_data(data_dir: Path) -> pd.DataFrame:
18+
"""Load and filter Dominick's CSO data to test weeks."""
19+
df = pd.read_csv(
20+
data_dir / "category" / "wcso.csv",
21+
usecols=["STORE", "UPC", "WEEK", "MOVE", "QTY", "PRICE", "PROFIT", "OK"],
22+
)
23+
df = df[(df["OK"] == 1) & (df["PRICE"] > 0)]
24+
df["unit_price"] = df["PRICE"] / df["QTY"]
25+
df["cost"] = df["PRICE"] * (1 - df["PROFIT"] / 100) / df["QTY"]
26+
test = df[df["WEEK"] >= 341].copy()
27+
return test
2028

2129

2230
def run_competitive_matching(
23-
category_avg_prices: np.ndarray,
24-
cost_vector: np.ndarray,
25-
demand_fn,
26-
noise_pct: float = 0.05,
27-
n_episodes: int = 10,
28-
H: int = 13,
31+
test_df: pd.DataFrame,
32+
noise_pct: float = 0.02,
2933
seed: int = 42,
30-
) -> dict[str, float]:
31-
"""Run competitive matching baseline.
32-
33-
Args:
34-
category_avg_prices: (n_skus,) average category prices as competitor proxy.
35-
cost_vector: (n_skus,) per-SKU cost.
36-
demand_fn: Callable(prices) -> units_sold.
37-
noise_pct: Price noise as fraction of category average (default 5%).
38-
n_episodes: Number of evaluation episodes.
39-
H: Steps per episode.
40-
seed: Random seed.
41-
"""
34+
) -> dict:
35+
"""Match category-average price with small noise."""
4236
rng = np.random.default_rng(seed)
37+
df = test_df.copy()
38+
39+
cat_avg = df.groupby("UPC")["unit_price"].transform("mean")
40+
noise = rng.uniform(-noise_pct, noise_pct, len(df))
41+
df["proposed_price"] = cat_avg * (1 + noise)
42+
df["proposed_price"] = np.maximum(df["proposed_price"], df["cost"] * 1.01)
43+
df["gross_margin"] = (df["proposed_price"] - df["cost"]) * df["MOVE"]
4344

44-
total_profit = 0.0
45-
for ep in range(n_episodes):
46-
ep_profit = 0.0
47-
for step in range(H):
48-
noise = rng.uniform(-noise_pct, noise_pct, len(category_avg_prices))
49-
prices = category_avg_prices * (1 + noise)
50-
prices = np.clip(prices, cost_vector * 1.01, None) # ensure above cost
51-
units_sold = demand_fn(prices)
52-
profit = ((prices - cost_vector) * units_sold).sum()
53-
ep_profit += profit
54-
total_profit += ep_profit
55-
56-
avg_profit = total_profit / n_episodes
45+
total_margin = df["gross_margin"].sum()
46+
n_weeks = df["WEEK"].nunique()
5747
return {
58-
"avg_episode_profit": float(avg_profit),
59-
"profit_per_step": float(avg_profit / H),
48+
"method": "competitive_matching",
6049
"noise_pct": noise_pct,
50+
"total_gross_margin": float(total_margin),
51+
"mean_return": float(total_margin / n_weeks),
52+
"n_rows": len(df),
53+
"n_weeks": int(n_weeks),
54+
"eval_type": "data_replay",
6155
}
6256

6357

6458
def main() -> None:
6559
parser = argparse.ArgumentParser(description="Competitive matching baseline")
60+
parser.add_argument(
61+
"--data-dir",
62+
type=Path,
63+
default=Path("/workspace/docs/data"),
64+
)
65+
parser.add_argument("--noise-pct", type=float, default=0.02)
6666
parser.add_argument("--seed", type=int, default=42)
67-
parser.add_argument("--n-episodes", type=int, default=10)
68-
parser.add_argument("--n-skus", type=int, default=25)
69-
parser.add_argument("--use-wandb", action="store_true")
67+
parser.add_argument(
68+
"--output",
69+
type=Path,
70+
default=Path("/workspace/docs/results/baselines/competitive_matching.json"),
71+
)
7072
args = parser.parse_args()
7173

72-
rng = np.random.default_rng(args.seed)
73-
cost_vector = rng.uniform(0.50, 3.00, args.n_skus).astype(np.float32)
74-
category_avg = cost_vector * 1.25 # assume 25% markup is category average
74+
print("Loading Dominick's CSO test data (weeks 341-400)...")
75+
test_df = load_test_data(args.data_dir)
76+
print(f" {len(test_df)} rows, {test_df['WEEK'].nunique()} weeks, {test_df['UPC'].nunique()} UPCs")
7577

76-
def demand_fn(prices: np.ndarray) -> np.ndarray:
77-
base = 100.0
78-
return np.clip(base * np.exp(-2.5 * np.log(np.clip(prices, 0.01, None))), 0, 10000)
79-
80-
if args.use_wandb and HAS_WANDB:
81-
wandb.init(project="dreamprice", group="baselines", name="competitive-matching")
82-
83-
metrics = run_competitive_matching(
84-
category_avg,
85-
cost_vector,
86-
demand_fn,
87-
n_episodes=args.n_episodes,
88-
seed=args.seed,
89-
)
90-
print(f"Competitive matching: profit/step={metrics['profit_per_step']:.2f}")
78+
results = run_competitive_matching(test_df, args.noise_pct, args.seed)
79+
print(f"Competitive matching: mean return = {results['mean_return']:.2f}")
9180

92-
if args.use_wandb and HAS_WANDB:
93-
wandb.log(metrics)
94-
wandb.finish()
81+
args.output.parent.mkdir(parents=True, exist_ok=True)
82+
with open(args.output, "w") as f:
83+
json.dump(results, f, indent=2)
84+
print(f"Saved to {args.output}")
9585

9686

9787
if __name__ == "__main__":

scripts/baselines/cost_plus.py

Lines changed: 56 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,76 @@
1-
"""Cost-plus fixed markup baseline.
1+
"""Cost-plus fixed markup baseline evaluated on Dominick's test data.
22
3-
price = cost * (1 + target_margin)
4-
Sweep margins {0.15, 0.20, 0.25, 0.30}.
3+
Applies a fixed markup over wholesale cost to each SKU-week in the test period
4+
(weeks 341-400). Returns the cumulative gross margin.
55
"""
66

77
from __future__ import annotations
88

99
import argparse
10+
import json
11+
from pathlib import Path
1012

1113
import numpy as np
12-
13-
try:
14-
import wandb
15-
16-
HAS_WANDB = True
17-
except ImportError:
18-
HAS_WANDB = False
19-
20-
21-
def run_cost_plus(
22-
cost_vector: np.ndarray,
23-
demand_fn,
24-
target_margin: float,
25-
n_episodes: int = 10,
26-
H: int = 13,
27-
seed: int = 42,
28-
) -> dict[str, float]:
29-
"""Run cost-plus baseline for multiple episodes.
30-
31-
Args:
32-
cost_vector: (n_skus,) per-SKU cost.
33-
demand_fn: Callable(prices) -> units_sold.
34-
target_margin: Markup fraction (e.g. 0.25 = 25%).
35-
n_episodes: Number of episodes to evaluate.
36-
H: Steps per episode.
37-
seed: Random seed.
38-
39-
Returns:
40-
Dict of metrics.
41-
"""
42-
prices = cost_vector * (1 + target_margin)
43-
44-
total_profit = 0.0
45-
total_revenue = 0.0
46-
47-
for ep in range(n_episodes):
48-
ep_profit = 0.0
49-
for step in range(H):
50-
units_sold = demand_fn(prices)
51-
profit = ((prices - cost_vector) * units_sold).sum()
52-
ep_profit += profit
53-
total_revenue += (prices * units_sold).sum()
54-
total_profit += ep_profit
55-
56-
avg_profit = total_profit / n_episodes
57-
avg_revenue = total_revenue / n_episodes
58-
14+
import pandas as pd
15+
16+
17+
def load_test_data(data_dir: Path) -> pd.DataFrame:
18+
"""Load and filter Dominick's CSO data to test weeks."""
19+
df = pd.read_csv(
20+
data_dir / "category" / "wcso.csv",
21+
usecols=["STORE", "UPC", "WEEK", "MOVE", "QTY", "PRICE", "PROFIT", "OK"],
22+
)
23+
df = df[(df["OK"] == 1) & (df["PRICE"] > 0)]
24+
df["unit_price"] = df["PRICE"] / df["QTY"]
25+
df["cost"] = df["PRICE"] * (1 - df["PROFIT"] / 100) / df["QTY"]
26+
test = df[df["WEEK"] >= 341].copy()
27+
return test
28+
29+
30+
def run_cost_plus(test_df: pd.DataFrame, markup: float) -> dict:
31+
"""Apply fixed markup and compute gross margin using actual units sold."""
32+
df = test_df.copy()
33+
df["proposed_price"] = df["cost"] * (1 + markup)
34+
df["gross_margin"] = (df["proposed_price"] - df["cost"]) * df["MOVE"]
35+
total_margin = df["gross_margin"].sum()
36+
n_weeks = df["WEEK"].nunique()
5937
return {
60-
"target_margin": target_margin,
61-
"avg_episode_profit": float(avg_profit),
62-
"avg_episode_revenue": float(avg_revenue),
63-
"profit_per_step": float(avg_profit / H),
38+
"method": "cost_plus",
39+
"markup": markup,
40+
"total_gross_margin": float(total_margin),
41+
"mean_return": float(total_margin / n_weeks),
42+
"n_rows": len(df),
43+
"n_weeks": int(n_weeks),
44+
"eval_type": "data_replay",
6445
}
6546

6647

6748
def main() -> None:
68-
parser = argparse.ArgumentParser(description="Cost-plus baseline")
69-
parser.add_argument("--seed", type=int, default=42)
70-
parser.add_argument("--n-episodes", type=int, default=10)
71-
parser.add_argument("--n-skus", type=int, default=25)
72-
parser.add_argument("--H", type=int, default=13)
73-
parser.add_argument("--use-wandb", action="store_true")
49+
parser = argparse.ArgumentParser(description="Cost-plus baseline on Dominick's data")
50+
parser.add_argument(
51+
"--data-dir",
52+
type=Path,
53+
default=Path("/workspace/docs/data"),
54+
)
55+
parser.add_argument("--markup", type=float, default=0.25)
56+
parser.add_argument(
57+
"--output",
58+
type=Path,
59+
default=Path("/workspace/docs/results/baselines/cost_plus.json"),
60+
)
7461
args = parser.parse_args()
7562

76-
np.random.seed(args.seed)
77-
78-
# Synthetic cost and demand for standalone testing
79-
cost_vector = np.random.uniform(0.50, 3.00, size=args.n_skus).astype(np.float32)
80-
81-
def demand_fn(prices: np.ndarray) -> np.ndarray:
82-
"""Simple log-linear demand with elasticity ~ -2.5."""
83-
base = 100.0
84-
return np.clip(base * np.exp(-2.5 * np.log(np.clip(prices, 0.01, None))), 0, 10000)
85-
86-
if args.use_wandb and HAS_WANDB:
87-
wandb.init(project="dreamprice", group="baselines", name="cost-plus")
63+
print("Loading Dominick's CSO test data (weeks 341-400)...")
64+
test_df = load_test_data(args.data_dir)
65+
print(f" {len(test_df)} rows, {test_df['WEEK'].nunique()} weeks, {test_df['UPC'].nunique()} UPCs")
8866

89-
margins = [0.15, 0.20, 0.25, 0.30]
90-
for margin in margins:
91-
metrics = run_cost_plus(
92-
cost_vector,
93-
demand_fn,
94-
margin,
95-
n_episodes=args.n_episodes,
96-
H=args.H,
97-
seed=args.seed,
98-
)
99-
print(f"Margin {margin:.0%}: profit/step={metrics['profit_per_step']:.2f}")
100-
if args.use_wandb and HAS_WANDB:
101-
wandb.log(metrics)
67+
results = run_cost_plus(test_df, args.markup)
68+
print(f"Cost-plus ({args.markup:.0%}): mean return = {results['mean_return']:.2f}")
10269

103-
if args.use_wandb and HAS_WANDB:
104-
wandb.finish()
70+
args.output.parent.mkdir(parents=True, exist_ok=True)
71+
with open(args.output, "w") as f:
72+
json.dump(results, f, indent=2)
73+
print(f"Saved to {args.output}")
10574

10675

10776
if __name__ == "__main__":

0 commit comments

Comments
 (0)