1414# See the License for the specific language governing permissions and
1515# limitations under the License.
1616
17+ import math
1718from typing import List
1819
1920
@@ -33,3 +34,95 @@ def identity_reward(observation: List[float]) -> float:
3334 if observation :
3435 return observation [0 ]
3536 return 0.0
37+
38+
39+ def ai_dynamo_weighted_normalized_reward (observation : List [float ]) -> float :
40+ """Calculate reward using AI Dynamo's custom metrics."""
41+ ttft_idx = 0
42+ itl_idx = 1
43+ throughput_idx = 2
44+
45+ # Normalization
46+ ttft_baseline = 0.3 # seconds
47+ itl_baseline = 0.02 # seconds
48+ throughput_baseline = 50.0 # tokens/s
49+
50+ # Weighting between metrics - equal focus on TTFT and throughput
51+ ttft_weight = 0.45
52+ itl_weight = 0.1
53+ throughput_weight = 0.45
54+
55+ if len (observation ) < 3 :
56+ return - 1.0
57+
58+ ttft = observation [ttft_idx ]
59+ itl = observation [itl_idx ]
60+ throughput = observation [throughput_idx ]
61+
62+ ttft_reward = ttft_baseline / ttft
63+ itl_reward = itl_baseline / itl
64+
65+ throughput_reward = throughput / throughput_baseline
66+
67+ # Weighted combined reward
68+ reward = ttft_weight * ttft_reward + itl_weight * itl_reward + throughput_weight * throughput_reward
69+
70+ return reward
71+
72+
def ai_dynamo_ratio_normalized_reward(observation: List[float]) -> float:
    """Calculate reward as normalized throughput divided by latency metrics.

    Each metric is scaled by its baseline; the reward is
    ``throughput_norm / (ttft_norm * itl_norm)``, so higher throughput and
    lower latencies both increase it.

    Args:
        observation: [ttft_seconds, itl_seconds, throughput_tokens_per_s].

    Returns:
        The ratio reward, or 0.0 for short or non-positive observations.
    """
    # Baselines used to scale each raw metric to a unitless value.
    ttft_baseline = 1.0  # seconds (1000ms)
    itl_baseline = 0.03  # seconds (30ms)
    throughput_baseline = 1000.0  # tokens/s

    # Guard: need all three metrics present.
    if len(observation) < 3:
        return 0.0

    ttft, itl, throughput = observation[0], observation[1], observation[2]

    # Non-positive measurements are invalid — no reward.
    if min(ttft, itl, throughput) <= 0:
        return 0.0

    latency_product = (ttft / ttft_baseline) * (itl / itl_baseline)
    return (throughput / throughput_baseline) / latency_product
100+
101+
def ai_dynamo_log_scale_reward(observation: List[float]) -> float:
    """
    Calculate reward using log-scale metrics focused on throughput and TTFT.

    Since ITL is already optimized, we focus on the primary metrics.
    """
    # Guard: need all three metrics present.
    if len(observation) < 3:
        return 0.0

    ttft, itl, throughput = observation[0], observation[1], observation[2]

    # Non-positive measurements are invalid; emit a small fixed penalty.
    if ttft <= 0 or itl <= 0 or throughput <= 0:
        return -1e-3

    # log(x + 1) keeps every term finite and >= 0 for positive inputs.
    gain = math.log(throughput + 1)
    ttft_cost = math.log(ttft + 1)
    itl_cost = math.log(itl + 1)

    # TTFT dominates the penalty; ITL carries only a small residual weight.
    return gain - 0.7 * ttft_cost - 0.1 * itl_cost
0 commit comments