external-storage-demo/generate_expenses.py at main · drewhoskins-temporal/external-storage-demo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
"""Generate realistic synthetic expense data for the NickelAndDime demo.

20 employees. 18 have normal expense patterns. 2 are AI-heavy (CTO / VP AI
Research) whose transaction lists exceed Temporal's 2MB payload limit when
JSON-encoded, which is the failure the external-storage demo resolves.

One of the AI-heavy lists (emp-007's) also contains a $0 charge. That doesn't
fail in this branch, but it's the bug the codec-server arc will surface later.

Run: `python generate_expenses.py` -> writes expenses.json in this dir.
"""

import json
import random
from pathlib import Path

# Fixed seed for the random.Random instance created in main().
SEED = 42
# Number of token-charge transactions added to each AI-heavy employee.
# At roughly 230 bytes apiece, ~13k of them come to about 3MB, comfortably
# over Temporal's default 2MB blob-size limit.
AI_HEAVY_TXN_COUNT = 13_000

# Expense category -> list of (merchant_id, merchant_name) pairs.
# normal_transaction() picks a category from these keys and then a merchant
# from that category's list; ai_token_transaction() draws only from
# "AI Services".
# NOTE: generation is seeded, so the order of the keys and of the entries in
# each list affects the output. Reordering them changes expenses.json.
MERCHANTS = {
    "Cloud Services": [
        ("merch-aws", "Amazon Web Services"),
        ("merch-gcp", "Google Cloud Platform"),
        ("merch-azure", "Microsoft Azure"),
        ("merch-cloudflare", "Cloudflare"),
        ("merch-datadog", "Datadog"),
    ],
    "AI Services": [
        ("merch-anthropic", "Anthropic"),
        ("merch-openai", "OpenAI"),
        ("merch-cohere", "Cohere"),
        ("merch-huggingface", "Hugging Face"),
        ("merch-replicate", "Replicate"),
    ],
    "Software": [
        ("merch-github", "GitHub"),
        ("merch-jetbrains", "JetBrains"),
        ("merch-notion", "Notion"),
        ("merch-linear", "Linear"),
        ("merch-slack", "Slack"),
        ("merch-figma", "Figma"),
        ("merch-1password", "1Password"),
    ],
    "Travel": [
        ("merch-delta", "Delta Airlines"),
        ("merch-united", "United Airlines"),
        ("merch-uber", "Uber"),
        ("merch-lyft", "Lyft"),
        ("merch-marriott", "Marriott"),
        ("merch-airbnb", "Airbnb"),
    ],
    "Meals": [
        ("merch-doordash", "DoorDash"),
        ("merch-grubhub", "Grubhub"),
        ("merch-starbucks", "Starbucks"),
        ("merch-bluebottle", "Blue Bottle Coffee"),
        ("merch-chipotle", "Chipotle"),
        ("merch-sweetgreen", "Sweetgreen"),
    ],
    "Office Supplies": [
        ("merch-amazon-biz", "Amazon Business"),
        ("merch-staples", "Staples"),
        ("merch-bestbuy", "Best Buy"),
    ],
    "Hardware": [
        ("merch-dell", "Dell"),
        ("merch-apple-biz", "Apple Business"),
        ("merch-lenovo", "Lenovo"),
    ],
}

# Expense category -> candidate free-text descriptions for a transaction.
# Keyed by the same category names as MERCHANTS; the transaction builders
# pick one entry per transaction with rng.choice().
# NOTE: as with MERCHANTS, entry order feeds the seeded RNG, so reordering
# a list changes the generated data.
DESCRIPTIONS = {
    "Cloud Services": [
        "Monthly compute - production cluster",
        "S3 storage and data transfer",
        "Kubernetes cluster operations",
        "CDN bandwidth charges",
        "Load balancer and networking",
        "RDS database instances",
        "Monitoring and log retention",
    ],
    "AI Services": [
        "API usage - model inference",
        "Fine-tuning compute job",
        "Embeddings API calls",
        "Claude API - prompt testing",
        "GPT-4 API - chat completion",
        "Token usage - experimental pipeline",
        "Rate-limited retry batch",
        "Prompt engineering sandbox",
        "Nightly eval run",
        "Agent framework spike",
    ],
    "Software": [
        "Annual subscription renewal",
        "Team seat - new hire",
        "Enterprise plan upgrade",
        "Monthly subscription",
        "Pro plan - individual seat",
    ],
    "Travel": [
        "Flight - customer visit SFO-NYC",
        "Ride to airport",
        "Conference hotel stay",
        "Customer on-site travel",
        "Team offsite lodging",
        "Sales kickoff travel",
    ],
    "Meals": [
        "Team lunch",
        "Client dinner",
        "Offsite catering",
        "Morning coffee run",
        "Late-night coding fuel",
        "Interview lunch",
    ],
    "Office Supplies": [
        "Monitor replacement",
        "Desk accessories",
        "Ergonomic keyboard",
        "Home office setup",
    ],
    "Hardware": [
        "Developer laptop",
        "External monitor",
        "Docking station",
        "Replacement battery",
    ],
}

# Expense category -> (low, high) bounds for a transaction amount.
# normal_transaction() draws uniformly between the bounds and rounds to two
# decimals. ai_token_transaction() ignores this table and uses its own
# 0.01-50.0 range, so the "AI Services" row only applies to AI charges that
# appear as ordinary transactions.
# Amounts are presumably US dollars (the module docstring speaks of a "$0
# charge") - confirm against the consumer of expenses.json.
AMOUNT_RANGES = {
    "Cloud Services": (50, 5000),
    "AI Services": (10, 500),
    "Software": (10, 500),
    "Travel": (20, 2000),
    "Meals": (8, 300),
    "Office Supplies": (15, 800),
    "Hardware": (200, 3500),
}

# Roster of (employee_id, name, role, profile) tuples consumed by main().
# The third field is written to the output under the "role" key.
# profile == "ai_heavy" makes build_employee() add AI_HEAVY_TXN_COUNT token
# charges on top of the background expenses; any other value takes the
# ordinary branch.
# NOTE: main() walks this list in order with one shared RNG and one running
# transaction counter, so reordering rows changes every generated record.
EMPLOYEES = [
    ("emp-001", "Sarah Chen", "Engineering", "normal"),
    ("emp-002", "Marcus Rivera", "Sales", "normal"),
    ("emp-003", "Priya Patel", "Engineering", "normal"),
    ("emp-004", "James O'Brien", "Marketing", "normal"),
    ("emp-005", "Aisha Khan", "Finance", "normal"),
    ("emp-006", "Diego Fernandez", "Engineering", "normal"),
    ("emp-007", "Yuki Tanaka", "CTO", "ai_heavy"),  # big; build_employee() also seeds the $0 charge here
    ("emp-008", "Rachel Goldberg", "Sales", "normal"),
    ("emp-009", "Tom Blackwood", "Engineering", "normal"),
    ("emp-010", "Olivia Nakamura", "Design", "normal"),
    ("emp-011", "Raj Kumar", "Engineering", "normal"),
    ("emp-012", "Nina Petrov", "Operations", "normal"),
    ("emp-013", "Luis Hernandez", "VP AI Research", "ai_heavy"),  # big; no $0 charge
    ("emp-014", "Zoe Wright", "Marketing", "normal"),
    ("emp-015", "Kwame Asante", "Engineering", "normal"),
    ("emp-016", "Anja Mueller", "Sales", "normal"),
    ("emp-017", "Carlos Mendes", "Engineering", "normal"),
    ("emp-018", "Isla MacDonald", "People Ops", "normal"),
    ("emp-019", "Ravi Subramanian", "Engineering", "normal"),
    ("emp-020", "Fatima Al-Hassan", "Finance", "normal"),
]


def random_date(rng: random.Random) -> str:
    """Return a random ``YYYY-MM-DD`` date string in January-April 2026.

    The day is capped at 28, so the result is a valid calendar date for
    every month that can be drawn.

    Args:
        rng: Random source. Two ``randint`` draws are consumed: the month
            first, then the day.

    Returns:
        The date as a zero-padded string such as ``"2026-03-07"``.
    """
    # Draw order is part of the seeded output: month before day.
    mm = rng.randint(1, 4)
    dd = rng.randint(1, 28)
    return "2026-{:02d}-{:02d}".format(mm, dd)


def txn_id(counter: int) -> str:
    """Format a transaction counter as a ``txn-`` prefixed identifier.

    The number is zero-padded to a width of ten, so ``7`` becomes
    ``"txn-0000000007"``. Wider values are kept in full, not truncated.

    Args:
        counter: Integer sequence number of the transaction.

    Returns:
        The identifier string.
    """
    padded = format(counter, "010d")
    return "txn-" + padded


def normal_transaction(rng: random.Random, counter: int) -> dict:
    """Build one ordinary expense record in a randomly chosen category.

    The category is picked from the keys of ``MERCHANTS``. The merchant and
    description are then picked from that category's entries, and the amount
    is drawn uniformly from the category's ``AMOUNT_RANGES`` bounds and
    rounded to two decimals.

    Args:
        rng: Random source shared by the whole generation run.
        counter: Sequence number used to derive the transaction id.

    Returns:
        A dict with keys ``transaction_id``, ``date``, ``category``,
        ``merchant_id``, ``merchant_name``, ``amount`` and ``description``,
        in that order.
    """
    # The sequence of rng draws is part of the seeded output:
    # category, merchant, description, date, amount.
    category = rng.choice(list(MERCHANTS))
    merchant_id, merchant_name = rng.choice(MERCHANTS[category])
    description = rng.choice(DESCRIPTIONS[category])
    low, high = AMOUNT_RANGES[category]

    identifier = txn_id(counter)
    when = random_date(rng)
    amount = round(rng.uniform(low, high), 2)

    # Keyword order fixes the key order of the emitted JSON object.
    return dict(
        transaction_id=identifier,
        date=when,
        category=category,
        merchant_id=merchant_id,
        merchant_name=merchant_name,
        amount=amount,
        description=description,
    )


def ai_token_transaction(rng: random.Random, counter: int) -> dict:
    """Build one small "AI Services" token-charge record.

    The merchant and description come from the "AI Services" entries of
    ``MERCHANTS`` and ``DESCRIPTIONS``. The amount is drawn uniformly from
    0.01 to 50.0 and rounded to four decimals; ``AMOUNT_RANGES`` is not
    consulted.

    Args:
        rng: Random source shared by the whole generation run.
        counter: Sequence number used to derive the transaction id.

    Returns:
        A dict with the same keys, in the same order, as the records from
        ``normal_transaction``.
    """
    ai_category = "AI Services"
    # The sequence of rng draws is part of the seeded output:
    # merchant, description, date, amount.
    merchant_id, merchant_name = rng.choice(MERCHANTS[ai_category])
    description = rng.choice(DESCRIPTIONS[ai_category])

    identifier = txn_id(counter)
    when = random_date(rng)
    amount = round(rng.uniform(0.01, 50.0), 4)

    # Keyword order fixes the key order of the emitted JSON object.
    return dict(
        transaction_id=identifier,
        date=when,
        category=ai_category,
        merchant_id=merchant_id,
        merchant_name=merchant_name,
        amount=amount,
        description=description,
    )


def build_employee(rng, emp_id, name, role, profile, counter):
    """Generate one employee record together with the next free counter.

    An ``"ai_heavy"`` profile gets 60-120 ordinary transactions followed by
    ``AI_HEAVY_TXN_COUNT`` token charges, which is enough to push the payload
    over the 2MB limit. Any other profile gets 30-150 ordinary transactions.
    For the AI-heavy employee ``emp-007``, the transaction at the middle of
    the list is turned into a $0 charge (it surfaces in the codec-server
    debugging arc). The list is shuffled before it is returned.

    Args:
        rng: ``random.Random`` instance shared by the whole run.
        emp_id: Employee identifier, e.g. ``"emp-007"``.
        name: Employee display name.
        role: Value stored under the record's ``"role"`` key.
        profile: ``"ai_heavy"`` or anything else for an ordinary profile.
        counter: First transaction sequence number to use.

    Returns:
        A ``(employee_dict, next_counter)`` tuple. ``employee_dict`` has the
        keys ``employee_id``, ``name``, ``role`` and ``transactions``.
    """
    is_ai_heavy = profile == "ai_heavy"

    # Each step is (factory, how many to make). The size draw happens before
    # any transaction is generated, matching the seeded draw order.
    if is_ai_heavy:
        steps = [
            (normal_transaction, rng.randint(60, 120)),
            (ai_token_transaction, AI_HEAVY_TXN_COUNT),
        ]
    else:
        steps = [(normal_transaction, rng.randint(30, 150))]

    txns = []
    for make_txn, how_many in steps:
        txns += [make_txn(rng, counter + offset) for offset in range(how_many)]
        counter += how_many

    if is_ai_heavy and emp_id == "emp-007":
        # Overwrite the middle transaction in place; key order is unchanged.
        txns[len(txns) // 2].update(
            amount=0.0,
            description="Token usage - failed batch retry",
        )

    rng.shuffle(txns)
    employee = {
        "employee_id": emp_id,
        "name": name,
        "role": role,
        "transactions": txns,
    }
    return employee, counter


def main():
    """Generate every employee's expenses and write them to expenses.json.

    A single ``random.Random(SEED)`` and one running transaction counter are
    threaded through all employees in ``EMPLOYEES`` order. A summary line is
    printed per employee, flagging any whose JSON-encoded transaction list
    exceeds 2048 KB. The full dataset is then written, indented, next to this
    script, and the resulting file size is printed.
    """
    rng = random.Random(SEED)
    next_counter = 1
    roster = []

    for emp_id, name, role, profile in EMPLOYEES:
        record, next_counter = build_employee(
            rng, emp_id, name, role, profile, next_counter
        )
        txns = record["transactions"]
        # Size of the compact JSON encoding of the transaction list alone.
        size_kb = len(json.dumps(txns)) / 1024
        flag = ""
        if size_kb > 2048:
            flag = "  <-- LARGE"
        print(
            f"{emp_id:8s} {name:22s} {role:16s} "
            f"{len(txns):6d} txns  {size_kb:8.1f} KB{flag}"
        )
        roster.append(record)

    out = Path(__file__).parent / "expenses.json"
    with out.open("w") as f:
        json.dump({"employees": roster}, f, indent=2)

    total_mb = out.stat().st_size / 1024 / 1024
    print(f"\nWrote {out} ({total_mb:.2f} MB total)")


# Generate the dataset only when this file is executed as a script, so
# importing the module has no side effects.
if __name__ == "__main__":
    main()