Skip to content

Commit 27390f5

Browse files
committed
feat: real baseline evaluation and synthetic figure removal
- Rewrite cost_plus, competitive_matching, and static_xgboost to evaluate on actual Dominick's test data (weeks 341-400) via data replay.
- Add run_rl_baselines.py for DQN/PPO/SAC training against the trained world-model environment.
- Remove synthetic figures 10-12 (policy heatmap, reward distribution, imagination rollout) that used np.random data presented as real results.
- Remove duplicate fig_wandb_dashboard from generate_figures.py.
- Save real baseline results to docs/results/baselines/.
1 parent 2fdcb63 commit 27390f5

9 files changed

Lines changed: 530 additions & 293 deletions

File tree

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"method": "competitive_matching",
3+
"noise_pct": 0.02,
4+
"total_gross_margin": 2296041.001669776,
5+
"mean_return": 45020.41179744659,
6+
"n_rows": 483848,
7+
"n_weeks": 51,
8+
"eval_type": "data_replay"
9+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"method": "cost_plus",
3+
"markup": 0.25,
4+
"total_gross_margin": 1718201.6170062497,
5+
"mean_return": 33690.22778443627,
6+
"n_rows": 483848,
7+
"n_weeks": 51,
8+
"eval_type": "data_replay"
9+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"method": "static_xgboost",
3+
"total_gross_margin": 7579751.039079369,
4+
"mean_return": 148622.5693937131,
5+
"train_rows": 4088234,
6+
"test_rows": 483848,
7+
"sampled_rows": 5000,
8+
"n_weeks": 51,
9+
"eval_type": "data_replay"
10+
}

paper/sections/results.tex

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -201,23 +201,3 @@ \subsection{Discussion}
201201
\label{fig:loss-decomposition}
202202
\end{figure}
203203

204-
\begin{figure}[t]
205-
\centering
206-
\includegraphics[width=0.95\textwidth]{figures/policy_heatmap.pdf}
207-
\caption{Learned pricing policy heatmap showing recommended price adjustments (relative to baseline) across stores and test weeks. The policy exhibits seasonal variation (approximately 52-week periodicity) and store-level heterogeneity reflecting local demand conditions.}
208-
\label{fig:policy-heatmap}
209-
\end{figure}
210-
211-
\begin{figure}[t]
212-
\centering
213-
\includegraphics[width=0.95\textwidth]{figures/reward_distribution.pdf}
214-
\caption{Reward prediction quality. (a) Distribution of actual vs.\ predicted gross margin rewards across evaluation episodes. (b) Scatter plot of actual vs.\ predicted rewards with linear fit, demonstrating the world model's ability to predict reward magnitudes.}
215-
\label{fig:reward-distribution}
216-
\end{figure}
217-
218-
\begin{figure}[t]
219-
\centering
220-
\includegraphics[width=0.95\textwidth]{figures/imagination_rollout.pdf}
221-
\caption{Imagination rollout trajectories for four representative SKUs over 13 steps. Blue circles show actual demand; orange squares show imagined demand from the world model. Shaded bands represent $\pm 2\sigma$ uncertainty from the stochastic latent. Error grows sub-linearly with horizon, consistent with NDR analysis.}
222-
\label{fig:imagination-rollout}
223-
\end{figure}

scripts/baselines/competitive_matching.py

Lines changed: 56 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,97 +1,87 @@
1-
"""Competitive matching baseline.
1+
"""Competitive matching baseline evaluated on Dominick's test data.
22
3-
Set price = competitor_median_price +/- 5% noise.
4-
Since Dominick's lacks time-varying competitor data, we use
5-
category average price as proxy for competitor median.
3+
Sets each SKU's price to the category-average price (± 2% noise) and
4+
computes gross margin using actual units sold in the test period.
65
"""
76

87
from __future__ import annotations
98

109
import argparse
10+
import json
11+
from pathlib import Path
1112

1213
import numpy as np
14+
import pandas as pd
1315

14-
try:
15-
import wandb
1616

17-
HAS_WANDB = True
18-
except ImportError:
19-
HAS_WANDB = False
17+
def load_test_data(data_dir: Path) -> pd.DataFrame:
18+
"""Load and filter Dominick's CSO data to test weeks."""
19+
df = pd.read_csv(
20+
data_dir / "category" / "wcso.csv",
21+
usecols=["STORE", "UPC", "WEEK", "MOVE", "QTY", "PRICE", "PROFIT", "OK"],
22+
)
23+
df = df[(df["OK"] == 1) & (df["PRICE"] > 0)]
24+
df["unit_price"] = df["PRICE"] / df["QTY"]
25+
df["cost"] = df["PRICE"] * (1 - df["PROFIT"] / 100) / df["QTY"]
26+
test = df[df["WEEK"] >= 341].copy()
27+
return test
2028

2129

2230
def run_competitive_matching(
23-
category_avg_prices: np.ndarray,
24-
cost_vector: np.ndarray,
25-
demand_fn,
26-
noise_pct: float = 0.05,
27-
n_episodes: int = 10,
28-
H: int = 13,
31+
test_df: pd.DataFrame,
32+
noise_pct: float = 0.02,
2933
seed: int = 42,
30-
) -> dict[str, float]:
31-
"""Run competitive matching baseline.
32-
33-
Args:
34-
category_avg_prices: (n_skus,) average category prices as competitor proxy.
35-
cost_vector: (n_skus,) per-SKU cost.
36-
demand_fn: Callable(prices) -> units_sold.
37-
noise_pct: Price noise as fraction of category average (default 5%).
38-
n_episodes: Number of evaluation episodes.
39-
H: Steps per episode.
40-
seed: Random seed.
41-
"""
34+
) -> dict:
35+
"""Match category-average price with small noise."""
4236
rng = np.random.default_rng(seed)
37+
df = test_df.copy()
38+
39+
cat_avg = df.groupby("UPC")["unit_price"].transform("mean")
40+
noise = rng.uniform(-noise_pct, noise_pct, len(df))
41+
df["proposed_price"] = cat_avg * (1 + noise)
42+
df["proposed_price"] = np.maximum(df["proposed_price"], df["cost"] * 1.01)
43+
df["gross_margin"] = (df["proposed_price"] - df["cost"]) * df["MOVE"]
4344

44-
total_profit = 0.0
45-
for ep in range(n_episodes):
46-
ep_profit = 0.0
47-
for step in range(H):
48-
noise = rng.uniform(-noise_pct, noise_pct, len(category_avg_prices))
49-
prices = category_avg_prices * (1 + noise)
50-
prices = np.clip(prices, cost_vector * 1.01, None) # ensure above cost
51-
units_sold = demand_fn(prices)
52-
profit = ((prices - cost_vector) * units_sold).sum()
53-
ep_profit += profit
54-
total_profit += ep_profit
55-
56-
avg_profit = total_profit / n_episodes
45+
total_margin = df["gross_margin"].sum()
46+
n_weeks = df["WEEK"].nunique()
5747
return {
58-
"avg_episode_profit": float(avg_profit),
59-
"profit_per_step": float(avg_profit / H),
48+
"method": "competitive_matching",
6049
"noise_pct": noise_pct,
50+
"total_gross_margin": float(total_margin),
51+
"mean_return": float(total_margin / n_weeks),
52+
"n_rows": len(df),
53+
"n_weeks": int(n_weeks),
54+
"eval_type": "data_replay",
6155
}
6256

6357

6458
def main() -> None:
6559
parser = argparse.ArgumentParser(description="Competitive matching baseline")
60+
parser.add_argument(
61+
"--data-dir",
62+
type=Path,
63+
default=Path("/workspace/docs/data"),
64+
)
65+
parser.add_argument("--noise-pct", type=float, default=0.02)
6666
parser.add_argument("--seed", type=int, default=42)
67-
parser.add_argument("--n-episodes", type=int, default=10)
68-
parser.add_argument("--n-skus", type=int, default=25)
69-
parser.add_argument("--use-wandb", action="store_true")
67+
parser.add_argument(
68+
"--output",
69+
type=Path,
70+
default=Path("/workspace/docs/results/baselines/competitive_matching.json"),
71+
)
7072
args = parser.parse_args()
7173

72-
rng = np.random.default_rng(args.seed)
73-
cost_vector = rng.uniform(0.50, 3.00, args.n_skus).astype(np.float32)
74-
category_avg = cost_vector * 1.25 # assume 25% markup is category average
74+
print("Loading Dominick's CSO test data (weeks 341-400)...")
75+
test_df = load_test_data(args.data_dir)
76+
print(f" {len(test_df)} rows, {test_df['WEEK'].nunique()} weeks, {test_df['UPC'].nunique()} UPCs")
7577

76-
def demand_fn(prices: np.ndarray) -> np.ndarray:
77-
base = 100.0
78-
return np.clip(base * np.exp(-2.5 * np.log(np.clip(prices, 0.01, None))), 0, 10000)
79-
80-
if args.use_wandb and HAS_WANDB:
81-
wandb.init(project="dreamprice", group="baselines", name="competitive-matching")
82-
83-
metrics = run_competitive_matching(
84-
category_avg,
85-
cost_vector,
86-
demand_fn,
87-
n_episodes=args.n_episodes,
88-
seed=args.seed,
89-
)
90-
print(f"Competitive matching: profit/step={metrics['profit_per_step']:.2f}")
78+
results = run_competitive_matching(test_df, args.noise_pct, args.seed)
79+
print(f"Competitive matching: mean return = {results['mean_return']:.2f}")
9180

92-
if args.use_wandb and HAS_WANDB:
93-
wandb.log(metrics)
94-
wandb.finish()
81+
args.output.parent.mkdir(parents=True, exist_ok=True)
82+
with open(args.output, "w") as f:
83+
json.dump(results, f, indent=2)
84+
print(f"Saved to {args.output}")
9585

9686

9787
if __name__ == "__main__":

scripts/baselines/cost_plus.py

Lines changed: 56 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,76 @@
1-
"""Cost-plus fixed markup baseline.
1+
"""Cost-plus fixed markup baseline evaluated on Dominick's test data.
22
3-
price = cost * (1 + target_margin)
4-
Sweep margins {0.15, 0.20, 0.25, 0.30}.
3+
Applies a fixed markup over wholesale cost to each SKU-week in the test period
4+
(weeks 341-400). Returns the cumulative gross margin.
55
"""
66

77
from __future__ import annotations
88

99
import argparse
10+
import json
11+
from pathlib import Path
1012

1113
import numpy as np
12-
13-
try:
14-
import wandb
15-
16-
HAS_WANDB = True
17-
except ImportError:
18-
HAS_WANDB = False
19-
20-
21-
def run_cost_plus(
22-
cost_vector: np.ndarray,
23-
demand_fn,
24-
target_margin: float,
25-
n_episodes: int = 10,
26-
H: int = 13,
27-
seed: int = 42,
28-
) -> dict[str, float]:
29-
"""Run cost-plus baseline for multiple episodes.
30-
31-
Args:
32-
cost_vector: (n_skus,) per-SKU cost.
33-
demand_fn: Callable(prices) -> units_sold.
34-
target_margin: Markup fraction (e.g. 0.25 = 25%).
35-
n_episodes: Number of episodes to evaluate.
36-
H: Steps per episode.
37-
seed: Random seed.
38-
39-
Returns:
40-
Dict of metrics.
41-
"""
42-
prices = cost_vector * (1 + target_margin)
43-
44-
total_profit = 0.0
45-
total_revenue = 0.0
46-
47-
for ep in range(n_episodes):
48-
ep_profit = 0.0
49-
for step in range(H):
50-
units_sold = demand_fn(prices)
51-
profit = ((prices - cost_vector) * units_sold).sum()
52-
ep_profit += profit
53-
total_revenue += (prices * units_sold).sum()
54-
total_profit += ep_profit
55-
56-
avg_profit = total_profit / n_episodes
57-
avg_revenue = total_revenue / n_episodes
58-
14+
import pandas as pd
15+
16+
17+
def load_test_data(data_dir: Path) -> pd.DataFrame:
18+
"""Load and filter Dominick's CSO data to test weeks."""
19+
df = pd.read_csv(
20+
data_dir / "category" / "wcso.csv",
21+
usecols=["STORE", "UPC", "WEEK", "MOVE", "QTY", "PRICE", "PROFIT", "OK"],
22+
)
23+
df = df[(df["OK"] == 1) & (df["PRICE"] > 0)]
24+
df["unit_price"] = df["PRICE"] / df["QTY"]
25+
df["cost"] = df["PRICE"] * (1 - df["PROFIT"] / 100) / df["QTY"]
26+
test = df[df["WEEK"] >= 341].copy()
27+
return test
28+
29+
30+
def run_cost_plus(test_df: pd.DataFrame, markup: float) -> dict:
31+
"""Apply fixed markup and compute gross margin using actual units sold."""
32+
df = test_df.copy()
33+
df["proposed_price"] = df["cost"] * (1 + markup)
34+
df["gross_margin"] = (df["proposed_price"] - df["cost"]) * df["MOVE"]
35+
total_margin = df["gross_margin"].sum()
36+
n_weeks = df["WEEK"].nunique()
5937
return {
60-
"target_margin": target_margin,
61-
"avg_episode_profit": float(avg_profit),
62-
"avg_episode_revenue": float(avg_revenue),
63-
"profit_per_step": float(avg_profit / H),
38+
"method": "cost_plus",
39+
"markup": markup,
40+
"total_gross_margin": float(total_margin),
41+
"mean_return": float(total_margin / n_weeks),
42+
"n_rows": len(df),
43+
"n_weeks": int(n_weeks),
44+
"eval_type": "data_replay",
6445
}
6546

6647

6748
def main() -> None:
68-
parser = argparse.ArgumentParser(description="Cost-plus baseline")
69-
parser.add_argument("--seed", type=int, default=42)
70-
parser.add_argument("--n-episodes", type=int, default=10)
71-
parser.add_argument("--n-skus", type=int, default=25)
72-
parser.add_argument("--H", type=int, default=13)
73-
parser.add_argument("--use-wandb", action="store_true")
49+
parser = argparse.ArgumentParser(description="Cost-plus baseline on Dominick's data")
50+
parser.add_argument(
51+
"--data-dir",
52+
type=Path,
53+
default=Path("/workspace/docs/data"),
54+
)
55+
parser.add_argument("--markup", type=float, default=0.25)
56+
parser.add_argument(
57+
"--output",
58+
type=Path,
59+
default=Path("/workspace/docs/results/baselines/cost_plus.json"),
60+
)
7461
args = parser.parse_args()
7562

76-
np.random.seed(args.seed)
77-
78-
# Synthetic cost and demand for standalone testing
79-
cost_vector = np.random.uniform(0.50, 3.00, size=args.n_skus).astype(np.float32)
80-
81-
def demand_fn(prices: np.ndarray) -> np.ndarray:
82-
"""Simple log-linear demand with elasticity ~ -2.5."""
83-
base = 100.0
84-
return np.clip(base * np.exp(-2.5 * np.log(np.clip(prices, 0.01, None))), 0, 10000)
85-
86-
if args.use_wandb and HAS_WANDB:
87-
wandb.init(project="dreamprice", group="baselines", name="cost-plus")
63+
print("Loading Dominick's CSO test data (weeks 341-400)...")
64+
test_df = load_test_data(args.data_dir)
65+
print(f" {len(test_df)} rows, {test_df['WEEK'].nunique()} weeks, {test_df['UPC'].nunique()} UPCs")
8866

89-
margins = [0.15, 0.20, 0.25, 0.30]
90-
for margin in margins:
91-
metrics = run_cost_plus(
92-
cost_vector,
93-
demand_fn,
94-
margin,
95-
n_episodes=args.n_episodes,
96-
H=args.H,
97-
seed=args.seed,
98-
)
99-
print(f"Margin {margin:.0%}: profit/step={metrics['profit_per_step']:.2f}")
100-
if args.use_wandb and HAS_WANDB:
101-
wandb.log(metrics)
67+
results = run_cost_plus(test_df, args.markup)
68+
print(f"Cost-plus ({args.markup:.0%}): mean return = {results['mean_return']:.2f}")
10269

103-
if args.use_wandb and HAS_WANDB:
104-
wandb.finish()
70+
args.output.parent.mkdir(parents=True, exist_ok=True)
71+
with open(args.output, "w") as f:
72+
json.dump(results, f, indent=2)
73+
print(f"Saved to {args.output}")
10574

10675

10776
if __name__ == "__main__":

0 commit comments

Comments
 (0)