Skip to content

Commit 6b66483

Browse files
committed
feat: real RL baseline evaluation and paper results update
- Run DQN, PPO, SAC baselines against trained world model (50K steps) - Evaluate DreamPrice agent via imagination rollout (100 episodes) - Update Table 3 with real results: data-replay + world-model metrics - Update ablation table with correct full-model baseline (193.7) - Recalculate all delta percentages for ablation study - Upload baseline results to HuggingFace model repo - Paper compiles cleanly to 31 pages, all citations resolved
1 parent 128bbc7 commit 6b66483

8 files changed

Lines changed: 118 additions & 76 deletions

File tree

docs/results/baselines/dqn.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"mean_return": 4291.315105247498,
3+
"std_return": 0.08303969999733023,
4+
"median_return": 4291.321104049683,
5+
"min_return": 4290.514612197876,
6+
"max_return": 4291.527807235718,
7+
"n_episodes": 100,
8+
"method": "DQN",
9+
"total_timesteps": 50000,
10+
"eval_type": "world_model"
11+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"method": "DreamPrice",
3+
"mean_return": -43046.22080810547,
4+
"std_return": 9880.512029684898,
5+
"n_episodes": 100,
6+
"eval_type": "world_model"
7+
}

docs/results/baselines/ppo.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"mean_return": -13826.654898681641,
3+
"std_return": 0.02171893479934123,
4+
"median_return": -13826.657104492188,
5+
"min_return": -13826.658569335938,
6+
"max_return": -13826.438842773438,
7+
"n_episodes": 100,
8+
"method": "PPO",
9+
"total_timesteps": 50000,
10+
"eval_type": "world_model"
11+
}

docs/results/baselines/sac.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"mean_return": -620.1706192207337,
3+
"std_return": 0.06563190383905634,
4+
"median_return": -620.1796650886536,
5+
"min_return": -620.1800961494446,
6+
"max_return": -619.6034436225891,
7+
"n_episodes": 100,
8+
"method": "SAC",
9+
"total_timesteps": 50000,
10+
"eval_type": "world_model"
11+
}

paper/main.aux

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -152,14 +152,14 @@
152152
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Causal demand response curves. (a) Predicted demand using DML-PLIV elasticity ($\theta = -0.940$, solid) compared to OLS-derived demand ($\theta = -0.931$, dashed). The near-coincidence of the two curves reflects the modest endogeneity of shelf-stable categories; the gap would be larger in categories with higher promotional intensity. (b) Illustrative sensitivity analysis showing how demand response varies across a range of elasticity values $\theta \in [-3.0, -0.5]$.}}{16}{figure.caption.6}\protected@file@percent }
153153
\newlabel{fig:demand-curves}{{3}{16}{Causal demand response curves. (a) Predicted demand using DML-PLIV elasticity ($\theta = -0.940$, solid) compared to OLS-derived demand ($\theta = -0.931$, dashed). The near-coincidence of the two curves reflects the modest endogeneity of shelf-stable categories; the gap would be larger in categories with higher promotional intensity. (b) Illustrative sensitivity analysis showing how demand response varies across a range of elasticity values $\theta \in [-3.0, -0.5]$}{figure.caption.6}{}}
154154
\newlabel{fig:demand-curves@cref}{{[figure][3][]3}{[1][15][]16}}
155-
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Offline policy performance comparison. Mean Return is cumulative gross margin over the test period (weeks 341--400). Single seed ($s=42$). Rule-based baselines are evaluated via data replay on the actual Dominick's test data; RL baselines are evaluated within the trained world model.\vspace {2pt}}}{16}{table.caption.7}\protected@file@percent }
156-
\newlabel{tab:baseline-comparison}{{4}{16}{Offline policy performance comparison. Mean Return is cumulative gross margin over the test period (weeks 341--400). Single seed ($s=42$). Rule-based baselines are evaluated via data replay on the actual Dominick's test data; RL baselines are evaluated within the trained world model.\vspace {2pt}}{table.caption.7}{}}
155+
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Offline policy performance comparison. Single seed ($s=42$). Rule-based baselines are evaluated via data replay on the Dominick's test data (weeks 341--400); the ``Mean Gross Margin'' column reports weekly average gross margin in dollars. RL baselines are trained and evaluated within the trained world model; their ``Episode Return'' column reports cumulative reward per 13-step episode. \textsc {DreamPrice}{} is evaluated via imagination rollout with MOPO-LCB pessimistic rewards.\vspace {2pt}}}{16}{table.caption.7}\protected@file@percent }
156+
\newlabel{tab:baseline-comparison}{{4}{16}{Offline policy performance comparison. Single seed ($s=42$). Rule-based baselines are evaluated via data replay on the Dominick's test data (weeks 341--400); the ``Mean Gross Margin'' column reports weekly average gross margin in dollars. RL baselines are trained and evaluated within the trained world model; their ``Episode Return'' column reports cumulative reward per 13-step episode. \dreamprice {} is evaluated via imagination rollout with MOPO-LCB pessimistic rewards.\vspace {2pt}}{table.caption.7}{}}
157157
\newlabel{tab:baseline-comparison@cref}{{[table][4][]4}{[1][16][]16}}
158-
\citation{janner2019trust}
159158
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Training progress. (a) World model ELBO loss over gradient steps, showing convergence of the latent dynamics model. (b) Loss component decomposition: reconstruction loss, reward prediction loss, and KL divergence between posterior and prior.}}{17}{figure.caption.8}\protected@file@percent }
160159
\newlabel{fig:training-curves}{{4}{17}{Training progress. (a) World model ELBO loss over gradient steps, showing convergence of the latent dynamics model. (b) Loss component decomposition: reconstruction loss, reward prediction loss, and KL divergence between posterior and prior}{figure.caption.8}{}}
161160
\newlabel{fig:training-curves@cref}{{[figure][4][]4}{[1][17][]17}}
162161
\@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Ablation Study}{17}{subsection.4.5}\protected@file@percent }
162+
\citation{janner2019trust}
163163
\citation{yu2020mopo,levine2020offline}
164164
\@writefile{lot}{\contentsline {table}{\numberline {5}{\ignorespaces Ablation study. Each row removes or modifies one component; Mean Return is cumulative gross margin over test period. $\Delta $\% is relative change from full \textsc {DreamPrice}{}.}}{18}{table.caption.9}\protected@file@percent }
165165
\newlabel{tab:ablations}{{5}{18}{Ablation study. Each row removes or modifies one component; Mean Return is cumulative gross margin over test period. $\Delta $\% is relative change from full \dreamprice {}}{table.caption.9}{}}
@@ -171,42 +171,42 @@
171171
\@writefile{toc}{\contentsline {subsection}{\numberline {4.6}Discussion}{19}{subsection.4.6}\protected@file@percent }
172172
\@writefile{toc}{\contentsline {paragraph}{Endogeneity correction.}{19}{section*.11}\protected@file@percent }
173173
\@writefile{toc}{\contentsline {paragraph}{Mamba-2 vs. GRU.}{19}{section*.12}\protected@file@percent }
174-
\citation{ramsey1927contribution}
175174
\@writefile{toc}{\contentsline {paragraph}{Imagination horizon sensitivity.}{20}{section*.13}\protected@file@percent }
176175
\@writefile{toc}{\contentsline {paragraph}{Baseline comparison analysis.}{20}{section*.14}\protected@file@percent }
177176
\@writefile{toc}{\contentsline {paragraph}{Training dynamics.}{20}{section*.15}\protected@file@percent }
178-
\@writefile{toc}{\contentsline {paragraph}{Economic interpretation.}{20}{section*.16}\protected@file@percent }
177+
\citation{ramsey1927contribution}
179178
\citation{fildes2022retail}
179+
\@writefile{toc}{\contentsline {paragraph}{Economic interpretation.}{21}{section*.16}\protected@file@percent }
180180
\@writefile{toc}{\contentsline {paragraph}{World model quality and prediction horizon.}{21}{section*.17}\protected@file@percent }
181181
\@writefile{toc}{\contentsline {paragraph}{Computational efficiency.}{21}{section*.18}\protected@file@percent }
182182
\@writefile{toc}{\contentsline {paragraph}{Limitations.}{21}{section*.19}\protected@file@percent }
183-
\@writefile{toc}{\contentsline {section}{\numberline {5}Conclusion and Future Work}{21}{section.5}\protected@file@percent }
184-
\newlabel{sec:conclusion}{{5}{21}{Conclusion and Future Work}{section.5}{}}
185-
\newlabel{sec:conclusion@cref}{{[section][5][]5}{[1][21][]21}}
186183
\citation{rajbhandari2024drama}
187184
\citation{chernozhukov2018dml}
188-
\citation{levine2020offline,yu2020mopo}
189185
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces OLS vs.\ IV elasticity comparison. Each point represents one store's estimated price elasticity. The close alignment with the 45-degree line reflects modest endogeneity in the canned soup category; the DML-PLIV estimate (green dashed) provides the frozen parameter for the causal decoder.}}{22}{figure.caption.20}\protected@file@percent }
190186
\newlabel{fig:ols-vs-iv}{{6}{22}{OLS vs.\ IV elasticity comparison. Each point represents one store's estimated price elasticity. The close alignment with the 45-degree line reflects modest endogeneity in the canned soup category; the DML-PLIV estimate (green dashed) provides the frozen parameter for the causal decoder}{figure.caption.20}{}}
191187
\newlabel{fig:ols-vs-iv@cref}{{[figure][6][]6}{[1][21][]22}}
192-
\@writefile{toc}{\contentsline {paragraph}{Limitations.}{22}{section*.23}\protected@file@percent }
188+
\@writefile{toc}{\contentsline {section}{\numberline {5}Conclusion and Future Work}{22}{section.5}\protected@file@percent }
189+
\newlabel{sec:conclusion}{{5}{22}{Conclusion and Future Work}{section.5}{}}
190+
\newlabel{sec:conclusion@cref}{{[section][5][]5}{[1][21][]22}}
191+
\citation{levine2020offline,yu2020mopo}
193192
\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Bootstrap distributions of OLS and IV mean elasticity estimates (500 bootstrap samples). The tight concentration of both distributions confirms estimation stability.}}{23}{figure.caption.21}\protected@file@percent }
194193
\newlabel{fig:elasticity-bootstrap}{{7}{23}{Bootstrap distributions of OLS and IV mean elasticity estimates (500 bootstrap samples). The tight concentration of both distributions confirms estimation stability}{figure.caption.21}{}}
195194
\newlabel{fig:elasticity-bootstrap@cref}{{[figure][7][]7}{[1][21][]23}}
195+
\@writefile{toc}{\contentsline {paragraph}{Limitations.}{23}{section*.23}\protected@file@percent }
196196
\@writefile{toc}{\contentsline {paragraph}{Future work.}{23}{section*.24}\protected@file@percent }
197-
\@writefile{toc}{\contentsline {paragraph}{Reproducibility and open-source release.}{23}{section*.25}\protected@file@percent }
198197
\bibstyle{plainnat}
199198
\bibdata{references}
200199
\bibcite{agarwal2021deep}{{1}{2021}{{Agarwal et~al.}}{{Agarwal, Schwarzer, Castro, Courville, and Bellemare}}}
201200
\bibcite{bach2022doubleml}{{2}{2022}{{Bach et~al.}}{{Bach, Chernozhukov, Kurz, and Spindler}}}
202201
\bibcite{ban2021personalized}{{3}{2021}{{Ban and Keskin}}{{}}}
202+
\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Stacked area chart of world model loss decomposition over training. The reconstruction loss (blue) converges rapidly to near-zero, while the KL divergence (green) stabilizes at the free-bits threshold. The reward prediction loss (orange) shows the slowest convergence, reflecting the inherent stochasticity of gross margin outcomes.}}{24}{figure.caption.22}\protected@file@percent }
203+
\newlabel{fig:loss-decomposition}{{8}{24}{Stacked area chart of world model loss decomposition over training. The reconstruction loss (blue) converges rapidly to near-zero, while the KL divergence (green) stabilizes at the free-bits threshold. The reward prediction loss (orange) shows the slowest convergence, reflecting the inherent stochasticity of gross margin outcomes}{figure.caption.22}{}}
204+
\newlabel{fig:loss-decomposition@cref}{{[figure][8][]8}{[1][21][]24}}
205+
\@writefile{toc}{\contentsline {paragraph}{Reproducibility and open-source release.}{24}{section*.25}\protected@file@percent }
203206
\bibcite{bellemare2017distributional}{{4}{2017}{{Bellemare et~al.}}{{Bellemare, Dabney, and Munos}}}
204207
\bibcite{berry1995automobile}{{5}{1995}{{Berry et~al.}}{{Berry, Levinsohn, and Pakes}}}
205208
\bibcite{byrd2020abides}{{6}{2020}{{Byrd et~al.}}{{Byrd, Cardoso, Hybinette, and Balch}}}
206209
\bibcite{chen2022dynamic}{{7}{2022}{{Chen and Simchi-Levi}}{{}}}
207-
\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Stacked area chart of world model loss decomposition over training. The reconstruction loss (blue) converges rapidly to near-zero, while the KL divergence (green) stabilizes at the free-bits threshold. The reward prediction loss (orange) shows the slowest convergence, reflecting the inherent stochasticity of gross margin outcomes.}}{24}{figure.caption.22}\protected@file@percent }
208-
\newlabel{fig:loss-decomposition}{{8}{24}{Stacked area chart of world model loss decomposition over training. The reconstruction loss (blue) converges rapidly to near-zero, while the KL divergence (green) stabilizes at the free-bits threshold. The reward prediction loss (orange) shows the slowest convergence, reflecting the inherent stochasticity of gross margin outcomes}{figure.caption.22}{}}
209-
\newlabel{fig:loss-decomposition@cref}{{[figure][8][]8}{[1][21][]24}}
210210
\bibcite{chernozhukov2018dml}{{8}{2018}{{Chernozhukov et~al.}}{{Chernozhukov, Chetverikov, Demirer, Duflo, Hansen, Newey, and Robins}}}
211211
\bibcite{dao2024mamba2}{{9}{2024}{{Dao and Gu}}{{}}}
212212
\bibcite{fildes2022retail}{{10}{2022}{{Fildes et~al.}}{{Fildes, Ma, and Kolassa}}}
@@ -249,25 +249,25 @@
249249
\@writefile{toc}{\contentsline {section}{\numberline {B}Hyperparameter Configuration}{27}{appendix.B}\protected@file@percent }
250250
\newlabel{app:hyperparams}{{B}{27}{Hyperparameter Configuration}{appendix.B}{}}
251251
\newlabel{app:hyperparams@cref}{{[appendix][2][2147483647]B}{[1][27][]27}}
252-
\@writefile{toc}{\contentsline {section}{\numberline {C}Additional Ablation Results}{27}{appendix.C}\protected@file@percent }
253-
\newlabel{app:ablation-horizon}{{C}{27}{Additional Ablation Results}{appendix.C}{}}
254-
\newlabel{app:ablation-horizon@cref}{{[appendix][3][2147483647]C}{[1][27][]27}}
255252
\@writefile{lot}{\contentsline {table}{\numberline {6}{\ignorespaces Dominick's product category codes and descriptions.}}{28}{table.caption.27}\protected@file@percent }
256253
\newlabel{tab:category-codes}{{6}{28}{Dominick's product category codes and descriptions}{table.caption.27}{}}
257254
\newlabel{tab:category-codes@cref}{{[table][6][2147483647]6}{[1][27][]28}}
255+
\@writefile{toc}{\contentsline {section}{\numberline {C}Additional Ablation Results}{28}{appendix.C}\protected@file@percent }
256+
\newlabel{app:ablation-horizon}{{C}{28}{Additional Ablation Results}{appendix.C}{}}
257+
\newlabel{app:ablation-horizon@cref}{{[appendix][3][2147483647]C}{[1][27][]28}}
258258
\@writefile{toc}{\contentsline {section}{\numberline {D}Computational Requirements}{28}{appendix.D}\protected@file@percent }
259259
\newlabel{app:compute}{{D}{28}{Computational Requirements}{appendix.D}{}}
260-
\newlabel{app:compute@cref}{{[appendix][4][2147483647]D}{[1][27][]28}}
260+
\newlabel{app:compute@cref}{{[appendix][4][2147483647]D}{[1][28][]28}}
261261
\@writefile{lot}{\contentsline {table}{\numberline {7}{\ignorespaces Data statistics for canned soup category (preprocessed).}}{29}{table.caption.28}\protected@file@percent }
262262
\newlabel{tab:data-statistics}{{7}{29}{Data statistics for canned soup category (preprocessed)}{table.caption.28}{}}
263263
\newlabel{tab:data-statistics@cref}{{[table][7][2147483647]7}{[1][27][]29}}
264-
\@writefile{lot}{\contentsline {table}{\numberline {8}{\ignorespaces Full hyperparameter configuration.}}{29}{table.caption.29}\protected@file@percent }
265-
\newlabel{tab:hyperparams-full}{{8}{29}{Full hyperparameter configuration}{table.caption.29}{}}
266-
\newlabel{tab:hyperparams-full@cref}{{[table][8][2147483647]8}{[1][27][]29}}
264+
\@writefile{lot}{\contentsline {table}{\numberline {8}{\ignorespaces Full hyperparameter configuration.}}{30}{table.caption.29}\protected@file@percent }
265+
\newlabel{tab:hyperparams-full}{{8}{30}{Full hyperparameter configuration}{table.caption.29}{}}
266+
\newlabel{tab:hyperparams-full@cref}{{[table][8][2147483647]8}{[1][27][]30}}
267267
\@writefile{lot}{\contentsline {table}{\numberline {9}{\ignorespaces Imagination horizon sweep. Mean return and WM loss for cumulative gross margin over test period. Single seed ($s = 42$).}}{30}{table.caption.30}\protected@file@percent }
268268
\newlabel{tab:horizon-sweep}{{9}{30}{Imagination horizon sweep. Mean return and WM loss for cumulative gross margin over test period. Single seed ($s = 42$)}{table.caption.30}{}}
269-
\newlabel{tab:horizon-sweep@cref}{{[table][9][2147483647]9}{[1][27][]30}}
270-
\@writefile{lot}{\contentsline {table}{\numberline {10}{\ignorespaces Observed training time per seed (hours). Hardware: NVIDIA DGX Spark, 128\,GB unified memory. Ablation times measured from completed runs.}}{30}{table.caption.31}\protected@file@percent }
271-
\newlabel{tab:compute}{{10}{30}{Observed training time per seed (hours). Hardware: NVIDIA DGX Spark, 128\,GB unified memory. Ablation times measured from completed runs}{table.caption.31}{}}
272-
\newlabel{tab:compute@cref}{{[table][10][2147483647]10}{[1][28][]30}}
273-
\gdef \@abspage@last{30}
269+
\newlabel{tab:horizon-sweep@cref}{{[table][9][2147483647]9}{[1][28][]30}}
270+
\@writefile{lot}{\contentsline {table}{\numberline {10}{\ignorespaces Observed training time per seed (hours). Hardware: NVIDIA DGX Spark, 128\,GB unified memory. Ablation times measured from completed runs.}}{31}{table.caption.31}\protected@file@percent }
271+
\newlabel{tab:compute}{{10}{31}{Observed training time per seed (hours). Hardware: NVIDIA DGX Spark, 128\,GB unified memory. Ablation times measured from completed runs}{table.caption.31}{}}
272+
\newlabel{tab:compute@cref}{{[table][10][2147483647]10}{[1][28][]31}}
273+
\gdef \@abspage@last{31}

0 commit comments

Comments
 (0)