3434#elif defined(__GNUC__ ) || defined(__clang__ )
3535#define SIMSIMD_DYNAMIC __attribute__((visibility("default")))
3636#define SIMSIMD_PUBLIC __attribute__((unused)) inline static
37- #define SIMSIMD_INTERNAL __attribute__((always_inline)) inline static
37+ #define SIMSIMD_INTERNAL inline static
3838#else
3939#define SIMSIMD_DYNAMIC
4040#define SIMSIMD_PUBLIC inline static
@@ -436,16 +436,20 @@ SIMSIMD_STATIC_ASSERT(sizeof(simsimd_bf16_t) == 2, simsimd_bf16_t_must_be_2_byte
436436#endif
437437
#if !defined(SIMSIMD_F32_TO_I8)
/**
 *  @brief Rounds the single-precision value `x` half-away-from-zero, saturates it to [-128, 127],
 *         and stores the result through the pointer `y`. Needs no `<math.h>` functions.
 *
 *  The range checks are ordered so that a NaN fails every comparison and lands on the lower
 *  bound (-128) instead of reaching the float-to-int cast, where converting NaN is undefined.
 *
 *  @warning `x` is expanded several times; pass an expression without side effects.
 */
#define SIMSIMD_F32_TO_I8(x, y) \
    *(y) = (simsimd_i8_t)((x) >= 127 ? 127 : ((x) > -128 ? (int)((x) + ((x) < 0 ? -0.5f : 0.5f)) : -128))
#endif
#if !defined(SIMSIMD_F32_TO_U8)
/**
 *  @brief Rounds the single-precision value `x` to the nearest integer (halves round up),
 *         saturates it to [0, 255], and stores the result through the pointer `y`.
 *         Needs no `<math.h>` functions.
 *
 *  The range checks are ordered so that a NaN fails every comparison and lands on the lower
 *  bound (0) instead of reaching the float-to-int cast, where converting NaN is undefined.
 *  Only strictly positive values reach the rounding step, so a fixed `+ 0.5f` bias suffices.
 *
 *  @warning `x` is expanded several times; pass an expression without side effects.
 */
#define SIMSIMD_F32_TO_U8(x, y) \
    *(y) = (simsimd_u8_t)((x) >= 255 ? 255 : ((x) > 0 ? (int)((x) + 0.5f) : 0))
#endif
#if !defined(SIMSIMD_F64_TO_I8)
/**
 *  @brief Rounds the double-precision value `x` half-away-from-zero, saturates it to [-128, 127],
 *         and stores the result through the pointer `y`. Needs no `<math.h>` functions.
 *
 *  The range checks are ordered so that a NaN fails every comparison and lands on the lower
 *  bound (-128) instead of reaching the float-to-int cast, where converting NaN is undefined.
 *
 *  @warning `x` is expanded several times; pass an expression without side effects.
 */
#define SIMSIMD_F64_TO_I8(x, y) \
    *(y) = (simsimd_i8_t)((x) >= 127 ? 127 : ((x) > -128 ? (int)((x) + ((x) < 0 ? -0.5 : 0.5)) : -128))
#endif
#if !defined(SIMSIMD_F64_TO_U8)
/**
 *  @brief Rounds the double-precision value `x` to the nearest integer (halves round up),
 *         saturates it to [0, 255], and stores the result through the pointer `y`.
 *         Needs no `<math.h>` functions.
 *
 *  The range checks are ordered so that a NaN fails every comparison and lands on the lower
 *  bound (0) instead of reaching the float-to-int cast, where converting NaN is undefined.
 *  Only strictly positive values reach the rounding step, so a fixed `+ 0.5` bias suffices.
 *
 *  @warning `x` is expanded several times; pass an expression without side effects.
 */
#define SIMSIMD_F64_TO_U8(x, y) \
    *(y) = (simsimd_u8_t)((x) >= 255 ? 255 : ((x) > 0 ? (int)((x) + 0.5) : 0))
#endif
450454
451455/** @brief Convenience type for half-precision floating-point type conversions. */
@@ -467,20 +471,33 @@ typedef union {
467471 * https://web.archive.org/web/20210208132927/http://assemblyrequired.crashworks.org/timing-square-root/
468472 * https://stackoverflow.com/a/41460625/2766161
469473 */
470- SIMSIMD_PUBLIC simsimd_f32_t simsimd_approximate_inverse_square_root (simsimd_f32_t number ) {
474+ SIMSIMD_INTERNAL simsimd_f32_t simsimd_approximate_inverse_square_root (simsimd_f32_t number ) {
471475 simsimd_f32i32_t conv ;
472476 conv .f = number ;
473477 conv .i = 0x5F1FFFF9 - (conv .i >> 1 );
478+ // Refine using a Newton-Raphson step for better accuracy
474479 conv .f *= 0.703952253f * (2.38924456f - number * conv .f * conv .f );
475480 return conv .f ;
476481}
477482
483+ /**
484+ * @brief Approximates `sqrt(x)` using the fast inverse square root trick
485+ * with adjustments for direct square root approximation.
486+ *
487+ * Similar to `rsqrt` approximation but multiplies by `number` to get `sqrt`.
488+ * This technique is useful where `sqrt` approximation is needed in performance-critical code,
489+ * though modern hardware provides optimized alternatives.
490+ */
491+ SIMSIMD_INTERNAL simsimd_f32_t simsimd_approximate_square_root (simsimd_f32_t number ) {
492+ return number * simsimd_approximate_inverse_square_root (number );
493+ }
494+
478495/**
479496 * @brief Computes `log(x)` using the Mercator series.
480497 * The series in `x = number - 1` converges to the natural logarithm for `x` in (-1, 1], i.e. `number` in (0, 2].
481498 * Published in 1668 in "Logarithmotechnia".
482499 */
483- SIMSIMD_PUBLIC simsimd_f32_t simsimd_approximate_log (simsimd_f32_t number ) {
500+ SIMSIMD_INTERNAL simsimd_f32_t simsimd_approximate_log (simsimd_f32_t number ) {
484501 simsimd_f32_t x = number - 1 ;
485502 simsimd_f32_t x2 = x * x ;
486503 simsimd_f32_t x3 = x * x * x ;
@@ -497,7 +514,7 @@ SIMSIMD_PUBLIC simsimd_f32_t simsimd_approximate_log(simsimd_f32_t number) {
497514 * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
498515 * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
499516 */
500- SIMSIMD_PUBLIC simsimd_f32_t simsimd_f16_to_f32 (simsimd_f16_t const * x_ptr ) {
517+ SIMSIMD_INTERNAL simsimd_f32_t simsimd_f16_to_f32 (simsimd_f16_t const * x_ptr ) {
501518 unsigned short x = * (unsigned short const * )x_ptr ;
502519 unsigned int exponent = (x & 0x7C00 ) >> 10 ;
503520 unsigned int mantissa = (x & 0x03FF ) << 13 ;
@@ -519,7 +536,7 @@ SIMSIMD_PUBLIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) {
519536 * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233
520537 * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834
521538 */
522- SIMSIMD_PUBLIC void simsimd_f32_to_f16 (simsimd_f32_t x , simsimd_f16_t * result_ptr ) {
539+ SIMSIMD_INTERNAL void simsimd_f32_to_f16 (simsimd_f32_t x , simsimd_f16_t * result_ptr ) {
523540 simsimd_f32i32_t conv ;
524541 conv .f = x ;
525542 unsigned int b = conv .i + 0x00001000 ;
@@ -538,7 +555,7 @@ SIMSIMD_PUBLIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_pt
538555 * https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307
539556 * https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus
540557 */
541- SIMSIMD_PUBLIC simsimd_f32_t simsimd_bf16_to_f32 (simsimd_bf16_t const * x_ptr ) {
558+ SIMSIMD_INTERNAL simsimd_f32_t simsimd_bf16_to_f32 (simsimd_bf16_t const * x_ptr ) {
542559 unsigned short x = * (unsigned short const * )x_ptr ;
543560 simsimd_f32i32_t conv ;
544561 conv .i = x << 16 ; // Zero extends the mantissa
@@ -551,7 +568,7 @@ SIMSIMD_PUBLIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) {
551568 * https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307
552569 * https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus
553570 */
554- SIMSIMD_PUBLIC void simsimd_f32_to_bf16 (simsimd_f32_t x , simsimd_bf16_t * result_ptr ) {
571+ SIMSIMD_INTERNAL void simsimd_f32_to_bf16 (simsimd_f32_t x , simsimd_bf16_t * result_ptr ) {
555572 simsimd_f32i32_t conv ;
556573 conv .f = x ;
557574 conv .i += 0x8000 ; // Rounding is optional
@@ -561,12 +578,12 @@ SIMSIMD_PUBLIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_
561578 * (unsigned short * )result_ptr = (unsigned short )conv .i ;
562579}
563580
564- SIMSIMD_PUBLIC simsimd_u32_t simsimd_u32_rol (simsimd_u32_t x , int n ) { return (x << n ) | (x >> (32 - n )); }
565- SIMSIMD_PUBLIC simsimd_u16_t simsimd_u16_rol (simsimd_u16_t x , int n ) { return (x << n ) | (x >> (16 - n )); }
566- SIMSIMD_PUBLIC simsimd_u8_t simsimd_u8_rol (simsimd_u8_t x , int n ) { return (x << n ) | (x >> (8 - n )); }
567- SIMSIMD_PUBLIC simsimd_u32_t simsimd_u32_ror (simsimd_u32_t x , int n ) { return (x >> n ) | (x << (32 - n )); }
568- SIMSIMD_PUBLIC simsimd_u16_t simsimd_u16_ror (simsimd_u16_t x , int n ) { return (x >> n ) | (x << (16 - n )); }
569- SIMSIMD_PUBLIC simsimd_u8_t simsimd_u8_ror (simsimd_u8_t x , int n ) { return (x >> n ) | (x << (8 - n )); }
581+ SIMSIMD_INTERNAL simsimd_u32_t simsimd_u32_rol (simsimd_u32_t x , int n ) { return (x << n ) | (x >> (32 - n )); }
582+ SIMSIMD_INTERNAL simsimd_u16_t simsimd_u16_rol (simsimd_u16_t x , int n ) { return (x << n ) | (x >> (16 - n )); }
583+ SIMSIMD_INTERNAL simsimd_u8_t simsimd_u8_rol (simsimd_u8_t x , int n ) { return (x << n ) | (x >> (8 - n )); }
584+ SIMSIMD_INTERNAL simsimd_u32_t simsimd_u32_ror (simsimd_u32_t x , int n ) { return (x >> n ) | (x << (32 - n )); }
585+ SIMSIMD_INTERNAL simsimd_u16_t simsimd_u16_ror (simsimd_u16_t x , int n ) { return (x >> n ) | (x << (16 - n )); }
586+ SIMSIMD_INTERNAL simsimd_u8_t simsimd_u8_ror (simsimd_u8_t x , int n ) { return (x >> n ) | (x << (8 - n )); }
570587
571588#ifdef __cplusplus
572589} // extern "C"
0 commit comments