diff --git a/BUILD.bazel b/BUILD.bazel index ead35a0c524..35aebfb98bb 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -528,6 +528,7 @@ xnnpack_cc_library( ":microparams_h", ":operator_h", ":xnnpack_h", + "@FXdiv", ], ) diff --git a/BUILD.gn b/BUILD.gn index 26b9901b995..5297ffae0f7 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -918,6 +918,7 @@ xnnpack_source_set("scalar_microkernels") { deps = [ ":microkernel_defs", ":microkernel_headers", + "//third_party/fxdiv", ] sources = ALL_SCALAR_MICROKERNEL_SRCS } @@ -965,6 +966,7 @@ xnnpack_source_set("xnnpack") { ":scalar_microkernels", ":subgraph", ":table", + "//third_party/fxdiv", ] if (xnnpack_enable_arm_kleidiai) { deps += [ "//third_party/kleidiai" ] diff --git a/CMakeLists.txt b/CMakeLists.txt index a667fb46f6a..58884784f1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -445,7 +445,16 @@ IF(NOT XNNPACK_USE_SYSTEM_LIBS) EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build . WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/cpuinfo-download") SET(CPUINFO_SOURCE_DIR "${CMAKE_BINARY_DIR}/cpuinfo-source" CACHE STRING "cpuinfo source directory") + ENDIF() + IF(NOT DEFINED FXDIV_SOURCE_DIR) + MESSAGE(STATUS "Downloading FXdiv to ${CMAKE_BINARY_DIR}/FXdiv-source (define FXDIV_SOURCE_DIR to avoid it)") + CONFIGURE_FILE(cmake/DownloadFXdiv.cmake "${CMAKE_BINARY_DIR}/FXdiv-download/CMakeLists.txt") + EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . + WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/FXdiv-download") + EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build . 
+ WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/FXdiv-download") + SET(FXDIV_SOURCE_DIR "${CMAKE_BINARY_DIR}/FXdiv-source" CACHE STRING "FXdiv source directory") ENDIF() IF(NOT DEFINED PTHREADPOOL_SOURCE_DIR) @@ -1300,7 +1309,32 @@ IF(NOT TARGET pthreadpool) ENDIF() TARGET_LINK_LIBRARIES(xnnpack-base INTERFACE pthreadpool) - +# ---[ Configure FXdiv +IF(NOT TARGET fxdiv) + IF(NOT XNNPACK_USE_SYSTEM_LIBS) + SET(FXDIV_BUILD_TESTS OFF CACHE BOOL "") + SET(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") + ADD_SUBDIRECTORY( + "${FXDIV_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/FXdiv") + ELSE() + FIND_FILE(FXDIV_HDR fxdiv.h PATH_SUFFIXES include PATHS "${FXDIV_SOURCE_DIR}") + IF(NOT FXDIV_HDR) + MESSAGE(FATAL_ERROR "Cannot find fxdiv") + ENDIF() + ADD_LIBRARY(fxdiv STATIC "${FXDIV_HDR}") + TARGET_INCLUDE_DIRECTORIES(fxdiv INTERFACE "${FXDIV_SOURCE_DIR}/include") + SET_PROPERTY(TARGET fxdiv PROPERTY LINKER_LANGUAGE C) + ENDIF() +ENDIF() +IF(XNNPACK_BUILD_ALL_MICROKERNELS) + TARGET_LINK_LIBRARIES(xnnpack-microkernels-all PRIVATE fxdiv) +ENDIF() +TARGET_LINK_LIBRARIES(xnnpack-microkernels-prod PRIVATE fxdiv) +TARGET_LINK_LIBRARIES(xnnpack-indirection PRIVATE fxdiv) +IF(XNNPACK_BUILD_LIBRARY) + TARGET_LINK_LIBRARIES(XNNPACK PRIVATE fxdiv) +ENDIF() IF(XNNPACK_BUILD_LIBRARY) INSTALL(TARGETS XNNPACK xnnpack-microkernels-prod diff --git a/MODULE.bazel b/MODULE.bazel index c5580b78e88..f6dc7016495 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -64,6 +64,8 @@ http_archive( urls = ["https://github.com/google/benchmark/archive/7da00e8f6763d6e8c284d172c9cfcc5ae0ce9b7a.zip"], ) # LINT.ThenChange(cmake/DownloadGoogleBenchmark.cmake) + +# LINT.IfChange(FXdiv) # FXdiv library, used for repeated integer division by the same factor http_archive( name = "FXdiv", @@ -71,8 +73,7 @@ http_archive( strip_prefix = "FXdiv-b408327ac2a15ec3e43352421954f5b1967701d1", urls = ["https://github.com/Maratyszcza/FXdiv/archive/b408327ac2a15ec3e43352421954f5b1967701d1.zip"], ) - - +# LINT.ThenChange(cmake/DownloadFXdiv.cmake) # 
LINT.IfChange(pthreadpool) # pthreadpool library, used for parallelization diff --git a/build_params.bzl b/build_params.bzl index a85a92f33b7..f5e7dbb5320 100644 --- a/build_params.bzl +++ b/build_params.bzl @@ -212,6 +212,7 @@ XNNPACK_PARAMS_FOR_ARCH = { ], extra_deps = [ "//src/configs:config_hdrs", + "@FXdiv", ], ), diff --git a/cmake/DownloadFXdiv.cmake b/cmake/DownloadFXdiv.cmake new file mode 100644 index 00000000000..ba97ffe54bf --- /dev/null +++ b/cmake/DownloadFXdiv.cmake @@ -0,0 +1,30 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) + +PROJECT(fxdiv-download NONE) + +# Set file timestamps to the time of extraction. +IF(POLICY CMP0135) + CMAKE_POLICY(SET CMP0135 NEW) +ENDIF() + +# LINT.IfChange +INCLUDE(ExternalProject) +ExternalProject_Add(fxdiv + URL https://github.com/Maratyszcza/FXdiv/archive/b408327ac2a15ec3e43352421954f5b1967701d1.zip + URL_HASH SHA256=ab7dfb08829bee33dca38405d647868fb214ac685e379ec7ef2bebcd234cd44d + SOURCE_DIR "${CMAKE_BINARY_DIR}/FXdiv-source" + BINARY_DIR "${CMAKE_BINARY_DIR}/FXdiv" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) +# LINT.ThenChange(../MODULE.bazel:FXdiv) diff --git a/src/indirection.c b/src/indirection.c index 95f54d4d36c..4baeb50cc62 100644 --- a/src/indirection.c +++ b/src/indirection.c @@ -9,6 +9,7 @@ #include "src/xnnpack/indirection.h" #include +#include <fxdiv.h> #include #include #include @@ -40,25 +41,17 @@ void xnn_indirection_init_conv2d( size_t input_padding_top, size_t input_padding_left) { - assert(output_height != 0); - assert(output_width != 0); const size_t output_size = output_height * output_width; const size_t kernel_size = kernel_height * kernel_width; - size_t cur_oy = output_start / output_width; - size_t cur_ox = 
output_start % output_width; - const size_t clamp_oy = (output_size - 1) / output_width; - const size_t clamp_ox = (output_size - 1) % output_width; + const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width); for (size_t output_tile_start = output_start; output_tile_start < output_end; output_tile_start += output_tile_size) { - size_t oy = cur_oy; - size_t ox = cur_ox; for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) { - const size_t output_index = output_tile_start + output_tile_offset; - const bool clamped = output_index >= output_size - 1; - const size_t output_x = clamped ? clamp_ox : ox; - const size_t output_y = clamped ? clamp_oy : oy; - + const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1); + const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor); + const size_t output_x = output_y_x.remainder; + const size_t output_y = output_y_x.quotient; for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) { const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top; if (input_y < input_height) { @@ -81,17 +74,7 @@ void xnn_indirection_init_conv2d( } } } - - if (output_index < output_size - 1) { - ox++; - if (ox == output_width) { - ox = 0; - oy++; - } - } } - cur_oy = oy; - cur_ox = ox; } } @@ -118,26 +101,22 @@ void xnn_indirection_init_deconv2d( const size_t tiled_output_size = round_up(output_size, output_tile_size); const size_t kernel_size = kernel_height * kernel_width; - size_t cur_oy = 0; - size_t cur_ox = 0; - const size_t clamp_oy = (output_size - 1) / output_width; - const size_t clamp_ox = (output_size - 1) % output_width; + const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width); + const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height); + const struct fxdiv_divisor_size_t 
stride_width_divisor = fxdiv_init_size_t(stride_width); for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) { - size_t oy = cur_oy; - size_t ox = cur_ox; for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) { - const size_t output_index = output_tile_start + output_tile_offset; - const bool clamped = output_index >= output_size - 1; - const size_t output_x = clamped ? clamp_ox : ox; - const size_t output_y = clamped ? clamp_oy : oy; - + const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1); + const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor); + const size_t output_x = output_y_x.remainder; + const size_t output_y = output_y_x.quotient; for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) { const size_t y = output_y + padding_top - kernel_y * dilation_height; - const size_t input_y = y / stride_height; + const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor); for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) { const size_t x = output_x + padding_left - kernel_x * dilation_width; - const size_t input_x = x / stride_width; + const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor); const size_t kernel_index = kernel_y * kernel_width + kernel_x; const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset; if (input_y * stride_height == y && input_y < input_height && input_x * stride_width == x && input_x < input_width) { @@ -147,17 +126,7 @@ void xnn_indirection_init_deconv2d( } } } - - if (output_index < output_size - 1) { - ox++; - if (ox == output_width) { - ox = 0; - oy++; - } - } } - cur_oy = oy; - cur_ox = ox; } } diff --git a/src/u8-lut32norm/u8-lut32norm-scalar.c b/src/u8-lut32norm/u8-lut32norm-scalar.c index fa4c2d34108..929a388d4ea 100644 --- 
a/src/u8-lut32norm/u8-lut32norm-scalar.c +++ b/src/u8-lut32norm/u8-lut32norm-scalar.c @@ -8,6 +8,8 @@ #include +#include <fxdiv.h> + #include "src/xnnpack/lut.h" @@ -37,11 +39,12 @@ void xnn_u8_lut32norm_ukernel__scalar( const uint32_t vsum = compute_sum(n, x, t); assert(vsum != 0); + struct fxdiv_divisor_uint32_t vsum_divisor = fxdiv_init_uint32_t(vsum); const uint32_t vrounding = (vsum >> 1); do { const size_t vx = *x++; const uint32_t vt = t[vx]; - const uint32_t vq = ((vt << 8) + vrounding) / vsum; + const uint32_t vq = fxdiv_quotient_uint32_t((vt << 8) + vrounding, vsum_divisor); const uint8_t vy = vq > 255 ? UINT8_C(255) : (uint8_t) vq; *y++ = vy; } while (--n != 0);