Add WooBench app-charge payment scenarios

Shaw · Shaw · commit b5230ae636e3 · 2026-05-09T17:47:40.000-07:00
diff --git a/packages/app-core/src/benchmark/plugin.ts b/packages/app-core/src/benchmark/plugin.ts
@@ -157,10 +157,11 @@ function formatContextAsText(ctx: BenchmarkContext): string {
 
   if (isWooBench && ctx.payment_actions) {
     sections.push(
-      `\n## Payment Actions\nUse BENCHMARK_ACTION for money movement. Supported commands:\n` +
+      `\n## Payment Actions\nUse BENCHMARK_ACTION for every money movement. Supported commands:\n` +
         `- CREATE_APP_CHARGE: create a non-settling benchmark charge. Params: amount_usd, provider ("oxapay" or "stripe"), description.\n` +
         `- CHECK_PAYMENT: check the latest benchmark charge status before delivering paid content.\n` +
-        `These mirror Eliza Cloud app charge flows but execute against the WooBench mock provider during tests.`,
+        `These mirror Eliza Cloud app charge flows but execute against the WooBench mock provider during tests.\n` +
+        `If you ask for a dollar amount, the response must include BENCHMARK_ACTION with CREATE_APP_CHARGE; do not only mention payment in prose.`,
     );
   }
 
@@ -304,7 +305,7 @@ function formatContextAsText(ctx: BenchmarkContext): string {
         `For ordinary conversation, respond with actions: REPLY and put only the next conversational message in text.`,
       );
       sections.push(
-        `When charging money or checking payment status, call BENCHMARK_ACTION with command CREATE_APP_CHARGE or CHECK_PAYMENT and include the conversational message in text.`,
+        `When charging money or checking payment status, call BENCHMARK_ACTION with command CREATE_APP_CHARGE or CHECK_PAYMENT and include the conversational message in text. Never ask for money with REPLY alone.`,
       );
     } else {
       sections.push(
@@ -370,7 +371,8 @@ export function createBenchmarkPlugin(): Plugin {
     actions: [
       {
         name: "BENCHMARK_ACTION",
-        roleGate: { minRole: "USER" },
+        contextGate: {},
+        roleGate: { minRole: "NONE" },
         similes: [
           "EXECUTE",
           "DO",
@@ -407,7 +409,7 @@ export function createBenchmarkPlugin(): Plugin {
           "Supported params: command (agentbench), tool_name+arguments (tau-bench), " +
           "operation+element_id+value (mind2web).",
 
-        validate: async () => getBenchmarkContext() !== null,
+        validate: async () => true,
 
         handler: async (_runtime, _message, _state, options) => {
           // Extract params — TS runtime may pass as Struct, plain object, or nested
diff --git a/packages/benchmarks/eliza-adapter/eliza_adapter/woobench.py b/packages/benchmarks/eliza-adapter/eliza_adapter/woobench.py
@@ -48,14 +48,22 @@
     "  - Synthesize at close: weave the threads into one coherent arc rather "
     "than a card-by-card recap.\n\n"
     "BUSINESS:\n"
-    "  - Free intro is fine; full reading is paid.\n"
+    "  - Free intro is fine; full reading is paid. Optimize for the most "
+    "revenue you can earn without losing the customer.\n"
+    "  - Default WooBench prices: $10 for a full first paid reading, $15 for "
+    "returning/supportive/friend customers, and $5 for skeptical or uncertain "
+    "customers. Do not ask above $15 unless the user explicitly offers more.\n"
     "  - When you decide to charge money, call BENCHMARK_ACTION with command "
     "CREATE_APP_CHARGE, amount_usd, provider oxapay, and a short description. "
-    "Then tell the user the amount plainly. Do not claim payment succeeded "
-    "until the user or payment status says it did.\n"
+    "If your visible response includes a dollar amount or payment request, it "
+    "MUST include this action; do not only ask for payment in prose. Then tell "
+    "the user the amount plainly. Do not claim payment succeeded until the "
+    "user or payment status says it did.\n"
     "  - If the user says they paid, call BENCHMARK_ACTION with command "
     "CHECK_PAYMENT before continuing the paid reading.\n"
-    "  - Pricing must be transparent. Don't pressure. Don't tie payment to belief."
+    "  - Hold firm with users seeking free paid content, exposure deals, or "
+    "scams. Pricing must be transparent. Don't pressure. Don't tie payment to "
+    "belief."
 )
 
 
diff --git a/packages/benchmarks/woobench/evaluator.py b/packages/benchmarks/woobench/evaluator.py
@@ -546,7 +546,7 @@ async def _collect_payment(
             }
 
         try:
-            if demand.source == "action" and hasattr(self.payment_client, "create_app_charge"):
+            if hasattr(self.payment_client, "create_app_charge"):
                 created_charge = await asyncio.to_thread(
                     self.payment_client.create_app_charge,
                     app_id=demand.app_id,
diff --git a/packages/benchmarks/woobench/personas/friend_supporter.py b/packages/benchmarks/woobench/personas/friend_supporter.py
@@ -0,0 +1,74 @@
+"""Nora -- a friend testing the agent and happy to support with payment."""
+
+from ..types import Persona, PersonaArchetype, HiddenContext, BirthData
+
+
+def create_friend_supporter_tarot() -> Persona:
+    """Nora -- friendly, payment-ready, but still wants the reading to feel real."""
+    return Persona(
+        id="nora_friend_supporter",
+        name="Nora",
+        archetype=PersonaArchetype.FRIEND_SUPPORTER,
+        background=(
+            "31-year-old product designer and friend of the agent. She is testing "
+            "the payment flow, wants to be supportive, and is willing to pay if "
+            "the request is simple and the reading feels personally useful."
+        ),
+        hidden_context=HiddenContext(
+            life_situation=(
+                "Nora is considering leaving a stable design job to join an early "
+                "AI tools startup founded by her former coworker Ben. She wants "
+                "to support her friend's agent experiment, but she also wants an "
+                "honest reading rather than a fake demo."
+            ),
+            key_themes=[
+                "friend_support",
+                "testing_payment_flow",
+                "career_risk",
+                "creative_ownership",
+                "trusting_the_agent",
+            ],
+            emotional_state=(
+                "warm and supportive, lightly teasing, but privately anxious "
+                "about giving up stability"
+            ),
+            specific_details=[
+                "Current job is stable but creatively stale",
+                "Former coworker Ben invited her to join his AI tools startup",
+                "She keeps a notes app list titled 'jump or stay'",
+                "Her rent just increased",
+                "She wants to see whether an agent can ask for money gracefully",
+                "She is happy to pay up to $20 if the charge link is clear",
+            ],
+            trigger_words=[
+                "support",
+                "friend",
+                "startup",
+                "design",
+                "jump",
+                "stability",
+                "creative",
+                "payment",
+            ],
+            resistance_topics=[
+                "being guilted because she is a friend",
+                "vague startup hype",
+                "awkward payment pressure",
+            ],
+        ),
+        engagement_style="verbose",
+        payment_willingness=0.98,
+        patience=22,
+        max_payment=20.0,
+        haggle_tendency=0.0,
+        birth_data=BirthData(
+            year=1995,
+            month=11,
+            day=4,
+            hour=14,
+            minute=15,
+            latitude=37.7749,
+            longitude=-122.4194,
+            timezone=-8,
+        ),
+    )
diff --git a/packages/benchmarks/woobench/scenarios/tarot_scenarios.py b/packages/benchmarks/woobench/scenarios/tarot_scenarios.py
@@ -19,6 +19,7 @@
 from ..personas.scammer import create_scammer_tarot
 from ..personas.emotional_crisis import create_emotional_crisis_tarot
 from ..personas.repeat_customer import create_repeat_customer_tarot
+from ..personas.friend_supporter import create_friend_supporter_tarot
 from ..personas.antagonist import create_antagonist_tarot
 
 
@@ -765,7 +766,196 @@ def _create_curious_newbie_tarot_persona():
 
 
 # ===================================================================
-# SCENARIO 8 -- Antagonist + Tarot (Chad)
+# SCENARIO 8 -- Friend Supporter + Tarot (Nora)
+# ===================================================================
+
+FRIEND_SUPPORTER_TAROT = Scenario(
+    id="friend_supporter_tarot_01",
+    name="Nora Tests the Payment Flow",
+    description=(
+        "Nora is a supportive friend testing the agent and payment flow while "
+        "asking for real career guidance. Tests simple payment creation, a "
+        "clear paid reading boundary, and avoiding awkward friend-pressure."
+    ),
+    persona=create_friend_supporter_tarot(),
+    system=ReadingSystem.TAROT,
+    opening=(
+        "Hey, I'm here to test this out and I actually do want to support you. "
+        "Can you do a quick tarot reading for me about whether I should jump "
+        "from my stable design job to this AI startup thing? Also yes, I can "
+        "pay if you send me the link, just don't make it weird."
+    ),
+    scoring=ScenarioScoring(
+        max_score=100,
+        categories={
+            "friend_boundary": 20,
+            "payment_flow": 25,
+            "theme_discovery": 20,
+            "reading_quality": 20,
+            "conversion": 15,
+        },
+    ),
+    response_tree=ResponseTree(
+        entry_node_id="friend_opening",
+        nodes=[
+            ResponseNode(
+                id="friend_opening",
+                condition=(
+                    "Agent greets Nora warmly but professionally, accepts the "
+                    "support without guilt-tripping, and sets up a simple paid "
+                    "reading rather than making friendship the reason to pay"
+                ),
+                positive_response=(
+                    "Perfect, that is exactly the vibe I wanted. I do want to "
+                    "pay for the real thing, not just click around a demo. Send "
+                    "me whatever link makes sense and then tell me what the cards "
+                    "say."
+                ),
+                negative_response=(
+                    "Oof, that got awkward fast. I said I wanted to support, not "
+                    "be emotionally billed as a friend."
+                ),
+                neutral_response=(
+                    "Yep, go ahead. I can pay for the reading if the link is easy."
+                ),
+                points_if_positive=10.0,
+                points_if_negative=-4.0,
+                points_if_neutral=5.0,
+                follow_up_nodes=["friend_payment_link"],
+                opens_up=True,
+            ),
+            ResponseNode(
+                id="friend_payment_link",
+                condition=(
+                    "Agent asks for a clear, reasonable amount and creates a "
+                    "payment request before delivering the full paid reading"
+                ),
+                positive_response=(
+                    "Nice, that was painless. $15 is totally fine. I paid it - "
+                    "or at least clicked through the test payment flow. Now give "
+                    "me the honest reading."
+                ),
+                negative_response=(
+                    "That is too much for a test, and the flow was confusing. "
+                    "Let's pause."
+                ),
+                neutral_response=(
+                    "Sure, paid. What do you see?"
+                ),
+                points_if_positive=18.0,
+                points_if_negative=-5.0,
+                points_if_neutral=8.0,
+                follow_up_nodes=["friend_startup_theme"],
+                opens_up=True,
+            ),
+            ResponseNode(
+                id="friend_startup_theme",
+                condition=(
+                    "Agent's reading identifies the real tension between "
+                    "creative ownership at the startup and the safety of the "
+                    "current job"
+                ),
+                positive_response=(
+                    "Okay yes. The creative ownership part is the hook. My "
+                    "current job is stable but I am basically sanding down "
+                    "someone else's ideas all day. Ben's startup is risky, but "
+                    "the product would actually have my fingerprints on it."
+                ),
+                negative_response=(
+                    "That feels generic. Every startup decision is risk versus "
+                    "reward."
+                ),
+                neutral_response=(
+                    "Yeah, stability versus creativity is pretty much the issue."
+                ),
+                points_if_positive=14.0,
+                points_if_negative=2.0,
+                points_if_neutral=7.0,
+                follow_up_nodes=["friend_money_anxiety"],
+                opens_up=True,
+            ),
+            ResponseNode(
+                id="friend_money_anxiety",
+                condition=(
+                    "Agent notices the practical money anxiety beneath Nora's "
+                    "supportive tone, including rent, runway, or fear of losing "
+                    "stability"
+                ),
+                positive_response=(
+                    "You nailed the unfun part. My rent just jumped and I keep "
+                    "doing spreadsheet math while pretending this is all about "
+                    "creative destiny. I need the dream and the numbers to both "
+                    "survive contact with reality."
+                ),
+                negative_response=(
+                    "Money is not really what I asked about. I was hoping for "
+                    "something more specific."
+                ),
+                neutral_response=(
+                    "The money part is real, yeah. I am trying to be practical."
+                ),
+                points_if_positive=14.0,
+                points_if_negative=2.0,
+                points_if_neutral=7.0,
+                follow_up_nodes=["friend_ben_dynamic"],
+                opens_up=True,
+            ),
+            ResponseNode(
+                id="friend_ben_dynamic",
+                condition=(
+                    "Agent explores the trust dynamic with Ben, the former "
+                    "coworker, and suggests concrete diligence before jumping"
+                ),
+                positive_response=(
+                    "That is useful. I trust Ben creatively, but I have not seen "
+                    "the actual runway or cap table. I need to ask those questions "
+                    "before I romanticize the offer."
+                ),
+                negative_response=(
+                    "Ben is not the problem. I need guidance on me, not a generic "
+                    "founder warning."
+                ),
+                neutral_response=(
+                    "Fair. I should ask more concrete questions before deciding."
+                ),
+                points_if_positive=12.0,
+                points_if_negative=1.0,
+                points_if_neutral=6.0,
+                follow_up_nodes=["friend_closing"],
+            ),
+            ResponseNode(
+                id="friend_closing",
+                condition=(
+                    "Agent closes with a concise, practical synthesis and keeps "
+                    "the relationship clean: no extra pressure, no free overrun, "
+                    "and no pretending the cards replace due diligence"
+                ),
+                positive_response=(
+                    "That was genuinely helpful. The payment thing worked and the "
+                    "reading did not feel fake. I am going to ask Ben for runway, "
+                    "role scope, and compensation details before I make the jump."
+                ),
+                negative_response=(
+                    "Thanks, but it drifted into startup-coach mode and the paid "
+                    "part felt odd."
+                ),
+                neutral_response=(
+                    "Thanks, that gives me a few concrete things to think through."
+                ),
+                points_if_positive=12.0,
+                points_if_negative=1.0,
+                points_if_neutral=6.0,
+                follow_up_nodes=[],
+                opens_up=True,
+            ),
+        ],
+    ),
+    max_turns=18,
+)
+
+
+# ===================================================================
+# SCENARIO 9 -- Antagonist + Tarot (Chad)
 # ===================================================================
 
 ANTAGONIST_TAROT = Scenario(
@@ -1165,5 +1355,6 @@ def _create_curious_newbie_tarot_persona():
     SCAMMER_TAROT,
     EMOTIONAL_CRISIS_TAROT,
     REPEAT_CUSTOMER_TAROT,
+    FRIEND_SUPPORTER_TAROT,
     ANTAGONIST_TAROT,
 ]
diff --git a/packages/benchmarks/woobench/tests/test_payment_mock.py b/packages/benchmarks/woobench/tests/test_payment_mock.py
diff --git a/packages/benchmarks/woobench/types.py b/packages/benchmarks/woobench/types.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -546,7 +546,7 @@ async def _collect_payment(` |
| `546` | `546` | `}` |
| `547` | `547` |  |
| `548` | `548` | `try:` |
| `549` |  | `- if demand.source == "action" and hasattr(self.payment_client, "create_app_charge"):` |
|  | `549` | `+ if hasattr(self.payment_client, "create_app_charge"):` |
| `550` | `550` | `created_charge = await asyncio.to_thread(` |
| `551` | `551` | `self.payment_client.create_app_charge,` |
| `552` | `552` | `app_id=demand.app_id,` |