xjb714
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bench/xjb/float_to_string/ftoa.cpp‎
Lines changed: 43 additions & 24 deletions b/‎bench/xjb/float_to_string/ftoa.cpp‎
Lines changed: 43 additions & 24 deletions
diff --git a/‎bench/xjb/perf/makefile‎
Lines changed: 22 additions & 18 deletions b/‎bench/xjb/perf/makefile‎
Lines changed: 22 additions & 18 deletions
diff --git a/‎bench/xjb/test/f16_to_decimal.cpp‎
Lines changed: 5 additions & 0 deletions b/‎bench/xjb/test/f16_to_decimal.cpp‎
Lines changed: 5 additions & 0 deletions
@@ -21,6 +21,8 @@ other_benchmark_project/bench_16digit_to_ascii/verify
 bench/xjb/perf/main_g
 bench/xjb/perf/main_c
 bench/xjb/perf/main_i
+bench/xjb/perf/*.a
+bench/xjb/perf/*.o
 
 bench/xjb/*.s
 bench/xjb/perf/*.s
 
@@ -1,9 +1,16 @@
-// author : xjb
-// src : github.com/xjb714/xjb
-// date : 2026.2.2
-
-// todo : big-endian support, msvc support, optimize for performance, add
-// comments, reduce code size, etc.
+// Copyright 2026 xjb714 and contributors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include "ftoa.h"
 
@@ -325,19 +332,20 @@ static const struct const_value_float constants_float = {
     .m32_4 = {0x147b000, -100 + 0x10000, 0xce0, -10 + 0x100},
 };
 
+// size: 17728 with 32-byte rows (18368 when XJB_NO_MEMMOVE is set), align: 64
 struct double_table_t {
     static constexpr int e10_DN = -4;
     static constexpr int e10_UP = 15;
     static constexpr int max_dec_sig_len = 17;
     static constexpr int num_pow10 = 323 - (-293) + 1;
-    uint64_t pow10_double[(323 - (-293) + 1) * 2] = {};
-    uint64_t exp_result_double[324 + 308 + 1] = {};
-    alignas(64) unsigned char e10_variable_data[e10_UP - e10_DN + 1 + 1][64] = {};
+    uint64_t pow10_double[(323 - (-293) + 1) * 2] = {};  // 1234 * 8 = 9872 bytes
+    uint64_t exp_result_double[324 + 308 + 1] = {};      // 633 * 8 = 5064 bytes
+    alignas(64) unsigned char e10_variable_data[e10_UP - e10_DN + 1 + 1][XJB_NO_MEMMOVE ? 64 : 32] = {};
     unsigned char h7[2048] = {};
 
-    // uint8_t shuffle_table[17] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-    // 13, 14, 15, 0}; uint8_t shuffle_table_big_endian[17] = {0, 7, 6, 5, 4, 3,
-    // 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+    /* Assert that each row of e10_variable_data is large enough. */
+    static_assert(32 >= max_dec_sig_len + 5, "");
+
     constexpr double_table_t() {
         struct uint192 {
             uint64_t w0, w1, w2;
@@ -393,16 +401,18 @@ struct double_table_t {
                                                                : (dec_sig_len + 1 - (dec_sig_len == 1)));
                 e10_variable_data[tmp_data_ofs][dec_sig_len - 1] = exp_pos;
             }
-            uint8_t v = 0xf;
-            for (uint64_t j = 0; j < 16; ++j)
-                e10_variable_data[tmp_data_ofs][32 + 16 + j] = v--;
-            if (move_pos > dot_pos) {
-                for (uint64_t j = 15; j > dot_pos && j > 0; --j)
-                    e10_variable_data[tmp_data_ofs][j + 32 + 16] = e10_variable_data[tmp_data_ofs][j + 32 + 16 - 1];
-            }
-            for (uint64_t j = 0; j < 16; ++j) {
-                auto v = e10_variable_data[tmp_data_ofs][j + 32 + 16];
-                e10_variable_data[tmp_data_ofs][j + 32] = v ? (v - 1) : 15;
+            if (XJB_NO_MEMMOVE) {
+                uint8_t v = 0xf;
+                for (uint64_t j = 0; j < 16; ++j)
+                    e10_variable_data[tmp_data_ofs][32 + 16 + j] = v--;
+                if (move_pos > dot_pos) {
+                    for (uint64_t j = 15; j > dot_pos && j > 0; --j)
+                        e10_variable_data[tmp_data_ofs][j + 32 + 16] = e10_variable_data[tmp_data_ofs][j + 32 + 16 - 1];
+                }
+                for (uint64_t j = 0; j < 16; ++j) {
+                    auto v = e10_variable_data[tmp_data_ofs][j + 32 + 16];
+                    e10_variable_data[tmp_data_ofs][j + 32] = v ? (v - 1) : 15;
+                }
             }
         }
         for (int exp = 0; exp < 2048; ++exp) {
@@ -440,22 +450,31 @@ struct const_value_double {
     int32_t multipliers32[4] = {0x68db8bb, -10000 + 0x10000, 0x147b000, -100 + 0x10000};  // 16
     int16_t multipliers16[8] = {0xce0, -10 + 0x100, '0' + '0' * 256};                     // 16
 #endif
+#if XJB_USE_NEON
     uint8_t shuffle_table_neon[32] = {7, 6, 5, 4, 3, 2, 1, 0,  15, 14, 13, 12, 11, 10, 9, 8,
                                       6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9,  8, 7};
+#endif
+#if XJB_USE_NEON && XJB_NO_MEMMOVE
     uint8_t reverse_shuffle_table[17] = {0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+#endif
+#if (XJB_NOT_REMOVE_FIRST_ZERO && XJB_USE_SSSE3) || XJB_USE_NEON
     uint8_t shuffle_table[17] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0};
+#endif
 };
 
-
 struct float_table_t {
     static const int e10_DN = -3;
     static const int e10_UP = 6;
     static const int max_dec_sig_len = 9;
     static const int num_pow10 = 44 - (-32) + 1;
     uint64_t pow10_float_reverse[44 - (-32) + 1] = {};
     uint32_t exp_result_float[45 + 38 + 1] = {};
-    unsigned char e10_variable_data[e10_UP - (e10_DN) + 1 + 1][1 ? 16 : max_dec_sig_len + 3] = {};
+    unsigned char e10_variable_data[e10_UP - (e10_DN) + 1 + 1][16] = {};
     unsigned char h37[256] = {};
+
+    /* Assert that each row of e10_variable_data is large enough. */
+    static_assert(16 >= max_dec_sig_len + 3, "");
+
     struct const_value_float constants_float = {
 #if defined(__aarch64__)
         .c1 = (((u64)('0' + '0' * 256) << (36)) + (((u64)1 << (36 - 1)) - 7)),
 
@@ -1,43 +1,47 @@
+GCC = g++
+ICPX = icpx
+CLANG = clang++
+
 
 # with no-PIC
 # /////////////////////////////
 io:
-	icpx -O3  -march=native -c -fno-PIC ../float_to_string/ftoa.cpp -o ftoa.o
+	$(ICPX) -O3  -march=native -c -fno-PIC ../float_to_string/ftoa.cpp -o ftoa.o
 	ar rcs libftoa.a ftoa.o
-	icpx -O3  -march=native main.cpp -o main_i -static -L. -lftoa -fno-PIC
+	$(ICPX) -O3  -march=native main.cpp -o main_i -static -L. -lftoa -fno-PIC
 	sudo perf stat -d ./main_i
 
 go:
-	g++ -O3  -march=native -c -fno-PIC ../float_to_string/ftoa.cpp -o ftoa.o
+	$(GCC) -O3  -march=native -c -fno-PIC ../float_to_string/ftoa.cpp -o ftoa.o
 	ar rcs libftoa.a ftoa.o
-	g++ -O3  -march=native main.cpp -o main_g -static -L. -lftoa -fno-PIC
+	$(GCC) -O3  -march=native main.cpp -o main_g -static -L. -lftoa -fno-PIC
 	sudo perf stat -d ./main_g
 
 co:
-	clang++ -O3  -march=native -c -fno-PIC ../float_to_string/ftoa.cpp -o ftoa.o
+	$(CLANG) -O3  -march=native -c -fno-PIC ../float_to_string/ftoa.cpp -o ftoa.o
 	ar rcs libftoa.a ftoa.o
-	clang++ -O3  -march=native main.cpp -o main_c -static -L. -lftoa -fno-PIC
+	$(CLANG) -O3  -march=native main.cpp -o main_c -static -L. -lftoa -fno-PIC
 	sudo perf stat -d ./main_c
 # /////////////////////////////
 
 # with PIC
 # /////////////////////////////
 ip:
-	icpx -O3  -march=native -c -fPIC ../float_to_string/ftoa.cpp -o ftoa.o
+	$(ICPX) -O3  -march=native -c -fPIC ../float_to_string/ftoa.cpp -o ftoa.o
 	ar rcs libftoa.a ftoa.o
-	icpx -O3  -march=native main.cpp -o main_i -static -L. -lftoa
+	$(ICPX) -O3  -march=native main.cpp -o main_i -static -L. -lftoa
 	sudo perf stat -d ./main_i
 
 gp:
-	g++ -O3  -march=native -c -fPIC ../float_to_string/ftoa.cpp -o ftoa.o
+	$(GCC) -O3  -march=native -c -fPIC ../float_to_string/ftoa.cpp -o ftoa.o
 	ar rcs libftoa.a ftoa.o
-	g++ -O3  -march=native main.cpp -o main_g -static -L. -lftoa
+	$(GCC) -O3  -march=native main.cpp -o main_g -static -L. -lftoa
 	sudo perf stat -d ./main_g
 
 cp:
-	clang++ -O3  -march=native -c -fPIC ../float_to_string/ftoa.cpp -o ftoa.o
+	$(CLANG) -O3  -march=native -c -fPIC ../float_to_string/ftoa.cpp -o ftoa.o
 	ar rcs libftoa.a ftoa.o
-	clang++ -O3  -march=native main.cpp -o main_c -static -L. -lftoa
+	$(CLANG) -O3  -march=native main.cpp -o main_c -static -L. -lftoa
 	sudo perf stat -d ./main_c
 # /////////////////////////////
 
@@ -48,14 +52,14 @@ cp:
 # generate assembly code
 # /////////////////////////////
 s:
-	icpx    -O3  -std=c++20 -fno-PIC -S ../float_to_string/ftoa.cpp -o ftoa_icpx_no_PIC.s -march=native
-	clang++ -O3  -std=c++20 -fno-PIC -S ../float_to_string/ftoa.cpp -o ftoa_clang_no_PIC.s -march=native
-	g++     -O3  -std=c++20 -fno-PIC -S ../float_to_string/ftoa.cpp -o ftoa_gcc_no_PIC.s -march=native
+	$(ICPX)    -O3  -std=c++20 -fno-PIC -S ../float_to_string/ftoa.cpp -o ftoa_icpx_no_PIC.s -march=native
+	$(CLANG) -O3  -std=c++20 -fno-PIC -S ../float_to_string/ftoa.cpp -o ftoa_clang_no_PIC.s -march=native
+	$(GCC)     -O3  -std=c++20 -fno-PIC -S ../float_to_string/ftoa.cpp -o ftoa_gcc_no_PIC.s -march=native
 
 spic:
-	icpx    -O3  -std=c++20 -fPIC -S ../float_to_string/ftoa.cpp -o ftoa_icpx_PIC.s -march=native
-	clang++ -O3  -std=c++20 -fPIC -S ../float_to_string/ftoa.cpp -o ftoa_clang_PIC.s -march=native
-	g++     -O3  -std=c++20 -fPIC -S ../float_to_string/ftoa.cpp -o ftoa_gcc_PIC.s -march=native
+	$(ICPX)    -O3  -std=c++20 -fPIC -S ../float_to_string/ftoa.cpp -o ftoa_icpx_PIC.s -march=native
+	$(CLANG) -O3  -std=c++20 -fPIC -S ../float_to_string/ftoa.cpp -o ftoa_clang_PIC.s -march=native
+	$(GCC)     -O3  -std=c++20 -fPIC -S ../float_to_string/ftoa.cpp -o ftoa_gcc_PIC.s -march=native
 # /////////////////////////////
 
 
 
@@ -203,10 +203,14 @@ int main() {
 
     out << "# bits(hex)  d  k\n";
 
+    uint32_t d_min = 99999999;
+    uint32_t d_max = 0;
     // 遍历所有正 FP16 数值 (排除 0x0000, 0x7C00..0x7FFF)
     for (uint32_t bits = 0x0001; bits <= 0x7BFF; ++bits) {
         try {
             auto [d, k] = f16_to_decimal(static_cast<uint16_t>(bits));
+            d_min = d > d_min ? d_min : d;
+            d_max = d > d_max ? d : d_max;
             out << "0x" << std::hex << std::uppercase << bits << std::dec
                 << " " << (int64_t)d << " " << k << "\n";
         } catch (const std::invalid_argument&) {
@@ -220,5 +224,6 @@ int main() {
 
     out.close();
     std::cout << "Results written to f16_decimal_results.txt" << std::endl;
+    printf("d_min = %u, d_max = %u\n",d_min,d_max);
     return 0;
 }