1+ #include < algorithm>
2+ #include < chrono>
3+ #include < cstdint>
4+ #include < cstring>
5+ #include < fstream>
6+ #include < iomanip>
7+ #include < iostream>
8+ #include < random>
9+ #include < sstream>
10+ #include < string>
11+
12+ // ==================== 128位分数类(同前) ====================
13+ struct Fraction {
14+ __int128 num, den;
15+
16+ Fraction (__int128 n = 0 , __int128 d = 1 ) : num(n), den(d) {
17+ if (den < 0 ) {
18+ num = -num;
19+ den = -den;
20+ }
21+ reduce ();
22+ }
23+
24+ void reduce () {
25+ if (den == 0 )
26+ throw std::runtime_error (" Denominator zero" );
27+ __int128 g = gcd (num < 0 ? -num : num, den);
28+ num /= g;
29+ den /= g;
30+ }
31+
32+ static __int128 gcd (__int128 a, __int128 b) {
33+ while (b) {
34+ __int128 t = b;
35+ b = a % b;
36+ a = t;
37+ }
38+ return a < 0 ? -a : a;
39+ }
40+
41+ bool operator <(const Fraction& rhs) const { return num * rhs.den < rhs.num * den; }
42+ bool operator <=(const Fraction& rhs) const { return num * rhs.den <= rhs.num * den; }
43+ bool operator ==(const Fraction& rhs) const { return num == rhs.num && den == rhs.den ; }
44+ bool operator !=(const Fraction& rhs) const { return !(*this == rhs); }
45+ bool operator >(const Fraction& rhs) const { return rhs < *this ; }
46+ bool operator >=(const Fraction& rhs) const { return rhs <= *this ; }
47+
48+ Fraction operator +(const Fraction& rhs) const { return Fraction (num * rhs.den + rhs.num * den, den * rhs.den ); }
49+ Fraction operator -(const Fraction& rhs) const { return Fraction (num * rhs.den - rhs.num * den, den * rhs.den ); }
50+ Fraction operator *(const Fraction& rhs) const { return Fraction (num * rhs.num , den * rhs.den ); }
51+ Fraction operator /(const Fraction& rhs) const { return Fraction (num * rhs.den , den * rhs.num ); }
52+
53+ __int128 floor () const {
54+ if (num >= 0 )
55+ return num / den;
56+ else
57+ return (num - den + 1 ) / den;
58+ }
59+
60+ Fraction frac () const { return *this - Fraction (floor (), 1 ); }
61+ };
62+
63+ Fraction pow (int base, int exp) {
64+ if (exp == 0 )
65+ return Fraction (1 );
66+ if (exp < 0 )
67+ return Fraction (1 ) / pow (base, -exp);
68+ __int128 result = 1 ;
69+ for (int i = 0 ; i < exp; ++i)
70+ result *= base;
71+ return Fraction (result);
72+ }
73+
74+ // ==================== FP16 解析(同前) ====================
75+ struct FP16Components {
76+ uint16_t bits;
77+ __int128 c;
78+ int q;
79+ bool is_regular;
80+ bool is_irregular;
81+ };
82+
83+ FP16Components f16_to_components (uint16_t bits) {
84+ int exp = (bits >> 10 ) & 0x1F ;
85+ int frac = bits & 0x3FF ;
86+
87+ FP16Components comp;
88+ comp.bits = bits;
89+ if (exp == 0 ) {
90+ comp.c = frac;
91+ comp.q = -24 ;
92+ comp.is_regular = true ;
93+ comp.is_irregular = false ;
94+ } else {
95+ comp.c = 1024 + frac;
96+ comp.q = exp - 25 ;
97+ comp.is_regular = (frac != 0 );
98+ comp.is_irregular = (frac == 0 );
99+ }
100+ return comp;
101+ }
102+
103+ int compute_k (int q, bool is_regular) {
104+ int k;
105+ if (is_regular) {
106+ k = (q * 1233 ) >> 12 ;
107+ } else {
108+ k = ((q * 1233 ) - 512 ) >> 12 ;
109+ }
110+ return k;
111+ }
112+
113+ std::pair<uint32_t , int > f16_to_decimal (uint16_t bits) {
114+ auto comp = f16_to_components (bits);
115+ __int128 c = comp.c ;
116+ int q = comp.q ;
117+ bool is_regular = comp.is_regular ;
118+ bool is_irregular = comp.is_irregular ;
119+
120+ int k = compute_k (q, is_regular);
121+ Fraction v = Fraction (c) * pow (2 , q);
122+ Fraction R = v * pow (10 , -k - 1 );
123+ __int128 m = R.floor ();
124+ Fraction n = R.frac ();
125+
126+ __int128 ten = 10 * m;
127+ Fraction tenR = v * pow (10 , -k);
128+ Fraction ten_n = tenR - Fraction (ten);
129+ __int128 floor_ten_n = ten_n.floor ();
130+ Fraction delta = ten_n.frac ();
131+
132+ __int128 one;
133+ if (delta == Fraction (1 , 2 )) {
134+ if (floor_ten_n % 2 == 0 )
135+ one = floor_ten_n;
136+ else
137+ one = floor_ten_n + 1 ;
138+ } else if (delta < Fraction (1 , 2 )) {
139+ one = floor_ten_n;
140+ } else {
141+ one = floor_ten_n + 1 ;
142+ }
143+ Fraction A = pow (2 , q - 1 ) * pow (10 , -k - 1 );
144+ if (is_irregular) {
145+ Fraction cond1 = pow (2 , q - 2 ) * pow (10 , -k);
146+ Fraction cond2 = pow (2 , q - 2 ) * pow (10 , -k - 1 );
147+ if (delta > cond1)
148+ one = floor_ten_n + 1 ;
149+ if (cond2 >= n)
150+ one = 0 ;
151+ } else {
152+ if (A > n || (A == n && c % 2 == 0 ))
153+ one = 0 ;
154+ }
155+ if (A > (Fraction (1 ) - n) || (A == (Fraction (1 ) - n) && c % 2 == 0 ))
156+ one = 10 ;
157+ __int128 d = ten + one;
158+ return {static_cast <uint32_t >(d), k};
159+ }
160+
161+ // ==================== BCD 辅助 ====================
162+ static inline uint32_t to_bcd4 (uint32_t d) {
163+ // convert binary to little_endian bcd;
164+ // require d range [0,9999];
165+ uint64_t abcd = d;
166+ uint64_t ab_cd = (abcd << 16 ) + (1 - (100 << 16 )) * (((abcd * 0x147b ) >> 19 ));
167+ uint64_t a_b_c_d = (ab_cd << 8 ) + (1 - (10 << 8 )) * (((ab_cd * 0x67 ) >> 10 ) & 0x000f000f );
168+ return static_cast <uint32_t >(a_b_c_d);
169+ }
170+
171+ // ==================== xjb16 核心转换函数 ====================
172+ char * xjb16 (uint16_t bits, char * buf) {
173+ // same result as str(numpy.float16(v)) in python.
174+ // v has 16bits binary representation = bits
175+
176+ // 16 = 1 + 5 + 10; sign + exp + sig
177+ *buf = ' -' ;
178+ buf += bits >> (10 + 5 );
179+ uint32_t exp = (bits >> 10 ) & ((1 << 5 ) - 1 );
180+ uint32_t sig = bits & ((1 << 10 ) - 1 );
181+ uint32_t sig_bin = sig | (1 << 10 );
182+ int32_t exp_bin = exp - ((1 << 4 ) - 1 ) - 10 ;
183+ if (exp == 0 ) [[unlikely]] {
184+ if (sig == 0 )
185+ return (char *)memcpy (buf, " 0.0" , 4 ) + 3 ;
186+ exp_bin = 1 - ((1 << 4 ) - 1 ) - 10 ;
187+ sig_bin = sig;
188+ }
189+ if (exp == 31 ) [[unlikely]]
190+ return (char *)memcpy (buf, sig ? " nan" : " inf" , 4 ) + 3 ;
191+
192+ uint16_t bits_abs = bits & ((1 << 15 ) - 1 );
193+ auto [d, k] = f16_to_decimal (bits_abs);
194+ int tz = -1 ;
195+ int mul = 1 ;
196+ int d_div = d;
197+ while (d_div * mul == d) {
198+ d_div /= 10 ;
199+ mul *= 10 ;
200+ tz++;
201+ }
202+
203+ // len range = [1, 5];
204+ // d range = [6, 19990];
205+ // f16 min = 6e-8; max is 65500
206+ uint32_t len = 1 + (d >= 10 ) + (d >= 100 ) + (d >= 1000 ) + (d >= 10000 );
207+ memcpy (buf, " 00000000" , 8 );
208+ uint32_t d_rem = (d >= 10000 ) ? (d - 10000 ) : d;
209+
210+ int len_2 = len;
211+ while (len_2 < 4 ) {
212+ len_2++;
213+ d_rem *= 10 ;
214+ }
215+
216+ uint32_t bcd = to_bcd4 (d_rem);
217+ uint32_t ascii = bcd | 0x30303030 ;
218+ uint32_t dec_sig_len = len - tz; // [1, 5];
219+
220+ const int FIXED_MIN = -4 ;
221+ const int FIXED_MAX = 2 ;
222+
223+ int e10 = k + len - 1 ;
224+ uint32_t first_sig_pos = (FIXED_MIN <= e10 && e10 <= -1 ) ? 1 - e10 : 0 ;
225+ uint32_t dot_pos = (0 <= e10 && e10 <= FIXED_MAX ) ? 1 + e10 : 1 ;
226+ uint32_t move_pos = dot_pos + ((0 <= e10 || e10 < FIXED_MIN ));
227+
228+ uint32_t exp_pos = (FIXED_MIN <= e10 && e10 <= -1 )
229+ ? dec_sig_len
230+ : (0 <= e10 && e10 <= FIXED_MAX ? (e10 + 3 > dec_sig_len + 1 ? e10 + 3 : dec_sig_len + 1 )
231+ : (dec_sig_len + 1 - (dec_sig_len == 1 )));
232+ char * buf_origin = buf;
233+
234+ buf += first_sig_pos;
235+ *buf = ' 1' ;
236+ memcpy (buf + (d >= 10000 ), &ascii, 4 );
237+
238+ memmove (&buf[move_pos], &buf[dot_pos], 8 ); // the index (first_sig_pos + dot_pos + sign) max = 7+1=8,
239+ buf_origin[dot_pos] = ' .' ;
240+ buf += exp_pos;
241+
242+ // comput exp ascii;
243+ // only negative value print exp_result;
244+ int e10_neg = e10 < 0 ;
245+ uint32_t e10_abs = e10_neg ? -e10 : e10 ;
246+ uint32_t exp_result =
247+ ' e' + ((e10_neg ? (uint32_t )' -' : (uint32_t )' +' ) << 8 ) + ((uint32_t )' 0' << 16 ) + ((e10_abs + ' 0' ) << 24 );
248+ if (FIXED_MIN <= e10 && e10 <= FIXED_MAX )
249+ exp_result = 0 ;
250+ uint32_t exp_len = (FIXED_MIN <= e10 && e10 <= FIXED_MAX ) ? 0 : 4 ;
251+ memcpy (buf, &exp_result, 4 );
252+ return buf + exp_len;
253+ }
254+
255+ // ==================== 性能测试主程序 ====================
256+ int main () {
257+ // 生成 N = 2^20 个随机 FP16 值,排除 inf 和 nan
258+ constexpr size_t N = 1 << 20 ;
259+ std::vector<uint16_t > values;
260+ values.reserve (N);
261+
262+ std::random_device rd;
263+ std::mt19937 gen (rd ());
264+ std::uniform_int_distribution<uint16_t > dist (0 , 65535 );
265+
266+ while (values.size () < N) {
267+ uint16_t bits = dist (gen);
268+ uint16_t exp = (bits >> 10 ) & 0x1F ;
269+ if (exp != 31 ) { // 排除 exp == 31 (inf/nan)
270+ values.push_back (bits);
271+ }
272+ }
273+
274+ // 缓冲区复用,避免多次分配栈内存影响测量(测量的是转换逻辑本身)
275+ char buffer[32 ]; // 足够大,xjb16 最多写十几个字符
276+
277+ // 预热:调用一次确保缓存和分支预测稳定
278+ (void )xjb16 (values[0 ], buffer);
279+
280+ // 开始计时
281+ auto start = std::chrono::high_resolution_clock::now ();
282+
283+ // 执行 N 次转换,并将结果通过 volatile 防止优化
284+ volatile char sink = 0 ;
285+ for (uint16_t bits : values) {
286+ char * end = xjb16 (bits, buffer);
287+ // 强制编译器认为结果被使用(防止循环被完全优化掉)
288+ sink += buffer[0 ] + (end - buffer);
289+ }
290+
291+ auto end = std::chrono::high_resolution_clock::now ();
292+ auto duration_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count ();
293+ double avg_ns = (duration_us * 1000.0 ) / N;
294+
295+ // 输出结果
296+ std::cout << " Total operations: " << N << " \n " ;
297+ std::cout << " Total time: " << duration_us << " us\n " ;
298+ std::cout << " Average time per call: " << avg_ns << " ns\n " ;
299+
300+ return 0 ;
301+ }
0 commit comments