33 commits
9966962
fix: #3137 speculative decoding and multimodal input support (#3276)
maxilevi Apr 9, 2025
943218b
feat: Add Qwen2.5-VL and refactor Qwen2-VL (#3156)
yechank-nvidia Apr 9, 2025
8d164f4
update allowlist (#3428)
tburt-nv Apr 9, 2025
215fb20
chore : split GptExecutor tests out of gpt tests to reduce single tes…
peaceh-nv Apr 10, 2025
9307ff9
fix: Add nested aliases for Llama 4 (#3381)
FrankD412 Apr 10, 2025
b5473f7
waive llama3.1 8B test cases with pipeline parallelism (#3433)
QiJune Apr 10, 2025
c59abae
feat: Add Gemma3 text-only model support (#3247)
brb-nv Apr 10, 2025
fbcf954
[MLA] Deallocate tensors after use (#3286)
hlu1 Apr 10, 2025
16c8f39
feat: Support TLLM_OVERRIDE_LAYER_NUM and TLLM_TRACE_MODEL_FORWARD fo…
yuxianq Apr 10, 2025
b331d62
add sqlite to rocky container (#3114)
tburt-nv Apr 10, 2025
863d023
test: fix memory leak of tests (#3392)
xinhe-nv Apr 10, 2025
cec65bd
clean the waive.txt (#3441)
byshiue Apr 10, 2025
67949f7
Update README and add benchmarking blog for DeepSeek-R1 (#3232)
Kefeng-Duan Apr 10, 2025
5023e0d
infra: Update some test description which is out of date (#3437)
EmmaQiaoCh Apr 10, 2025
10d2d16
Waive L0 test (#3442)
yiqingy0 Apr 10, 2025
3ade937
feat: Run PyExecutor's inference flow to estimate max_num_tokens for …
HuiGao-NV Apr 10, 2025
d7a0bf9
fix: updating ucxx, which appears to avoid occasional segfaults when …
jdebache Apr 10, 2025
c5e803b
chore: code cleanup for error logging and SharedMemory in proxy.py (#…
Superjomn Apr 10, 2025
f5281ff
waive some test cases of test_llm_multi_gpu.py (#3452)
QiJune Apr 10, 2025
af05749
feat: add qwen2 moe to torch flow; fix wrong imported KvCacheConfig i…
wm2012011492 Apr 10, 2025
a6a2ae6
chore: Rename nvsmall to nemotron nas (#3447)
amitz-nv Apr 10, 2025
8300218
feat: support llama4 nope layers; support FP8 checkpoint loading; (#3…
nvzhihanj Apr 10, 2025
d7f45e5
test: disable attention DP tests for single GPU (#3395)
Tabrizian Apr 10, 2025
a8310b0
feat: trtllm-gen fp4 GEMM for pytorch workflow (#3423)
DomBrown Apr 10, 2025
5616c0d
add precommit check to github actions (#3129)
tburt-nv Apr 10, 2025
6cef100
waive a test case of llama 3.1 with torch compile (#3461)
QiJune Apr 11, 2025
1e2a339
waive unittest/_torch/multi_gpu (#3464)
QiJune Apr 11, 2025
5142c78
fix: Beam Search Diversity (#3375)
wili-65535 Apr 11, 2025
16ca457
always trigger multi gpu test to protect modeling_llama.py and modeli…
QiJune Apr 11, 2025
410f563
test: Waive torch compile tests (#3471)
syuoni Apr 11, 2025
7048db6
Local test CI
ZhanruiSunCh Apr 11, 2025
16a02ad
For test
ZhanruiSunCh Apr 11, 2025
0d6a7ca
For test again
ZhanruiSunCh Apr 11, 2025
2 changes: 1 addition & 1 deletion .github/workflows/blossom-ci.yml
@@ -40,7 +40,7 @@ jobs:
startsWith(github.event.comment.body, '/bot skip --comment') ||
startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
startsWith(github.event.comment.body, '/bot kill')) && contains(
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jtchen0528","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar"]'),
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jtchen0528","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727"]'),
github.actor)
steps:
- name: Check if comment is issued by authorized person
22 changes: 22 additions & 0 deletions .github/workflows/l0-test.yml
@@ -16,6 +16,8 @@
# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: L0-Test
on:
+issue_comment:
+types: [created]
workflow_dispatch:
inputs:
sha:
@@ -28,6 +30,26 @@ on:
description: 'test results url'
required: true
jobs:
+Job-trigger:
+name: Start ci job
+if: |
+startsWith(github.event.comment.body, '/bot run') ||
+startsWith(github.event.comment.body, '/bot skip --comment') ||
+startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
+startsWith(github.event.comment.body, '/bot kill')
+runs-on: [self-hosted, Linux, Jenkins]
+steps:
+- name: Start ci job
+run: |
+CI_SERVER="${{ secrets.CI_SERVER }}"
+JENKINS_URL=$(echo "$CI_SERVER" | cut -d '@' -f 1)
+TOKEN=$(echo "$CI_SERVER" | cut -d '@' -f 2)
+sleep 100
+echo '${{ toJson(github.event) }}' > githubData.json
+curl -s -X POST \
+-H "Content-Type: application/json" \
+-d @githubData.json \
+"$JENKINS_URL/generic-webhook-trigger/invoke?token=$TOKEN"
Upload-Test:
name: Upload test results
runs-on: linux-amd64-cpu4
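The new Job-trigger job splits the `CI_SERVER` secret (of the form `<jenkins-url>@<token>`) and forwards the raw GitHub event JSON to the Jenkins generic-webhook-trigger endpoint. Below is a minimal C++ (libcurl) sketch of that same POST, for illustration only; the function name and error handling are ours, while the workflow itself does this with `curl` in a shell step.

```cpp
// Sketch only: mirrors the workflow's shell step, which posts the GitHub
// event payload to Jenkins' generic-webhook-trigger plugin endpoint.
// jenkinsUrl/token correspond to the two halves of the CI_SERVER secret.
#include <curl/curl.h>
#include <string>

bool postEventToJenkins(std::string const& jenkinsUrl, std::string const& token,
    std::string const& eventJson)
{
    CURL* curl = curl_easy_init();
    if (curl == nullptr)
    {
        return false;
    }

    std::string const url = jenkinsUrl + "/generic-webhook-trigger/invoke?token=" + token;
    curl_slist* headers = curl_slist_append(nullptr, "Content-Type: application/json");

    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, eventJson.c_str());

    CURLcode const rc = curl_easy_perform(curl); // blocking POST

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    return rc == CURLE_OK;
}
```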
43 changes: 43 additions & 0 deletions .github/workflows/precommit-check.yml
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Release Checks
on:
pull_request:
workflow_dispatch:
inputs:
ref:
description: 'commit sha to check'
required: true
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
precommit-check:
name: Pre-commit Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.ref }}

- uses: actions/setup-python@v5
with:
python-version: '3.12'
cache: 'pip'

- name: Run pre-commit checks
run: |
python3 -u scripts/release_check.py
2 changes: 1 addition & 1 deletion 3rdparty/ucxx
Submodule ucxx updated 49 files
+2 −0 .github/workflows/build.yaml
+15 −0 .github/workflows/pr.yaml
+1 −1 .github/workflows/trigger-breaking-change-alert.yaml
+2 −1 .pre-commit-config.yaml
+1 −1 README.md
+9 −9 conda/environments/all_cuda-118_arch-x86_64.yaml
+11 −11 conda/environments/all_cuda-128_arch-x86_64.yaml
+7 −7 conda/recipes/ucxx/conda_build_config.yaml
+9 −21 conda/recipes/ucxx/meta.yaml
+2 −3 cpp/CMakeLists.txt
+4 −3 cpp/include/ucxx/buffer.h
+1 −1 cpp/include/ucxx/delayed_submission.h
+3 −3 cpp/include/ucxx/endpoint.h
+7 −7 cpp/include/ucxx/request.h
+3 −0 cpp/include/ucxx/request_am.h
+1 −2 cpp/include/ucxx/request_tag_multi.h
+17 −2 cpp/include/ucxx/typedefs.h
+14 −5 cpp/include/ucxx/worker.h
+1 −0 cpp/python/src/exception.cpp
+1 −0 cpp/python/src/worker.cpp
+1 −0 cpp/src/config.cpp
+2 −0 cpp/src/context.cpp
+4 −4 cpp/src/delayed_submission.cpp
+2 −1 cpp/src/endpoint.cpp
+4 −2 cpp/src/internal/request_am.cpp
+2 −1 cpp/src/listener.cpp
+3 −3 cpp/src/memory_handle.cpp
+16 −4 cpp/src/remote_key.cpp
+15 −13 cpp/src/request_am.cpp
+1 −0 cpp/src/request_data.cpp
+0 −2 cpp/src/request_endpoint_close.cpp
+0 −2 cpp/src/request_flush.cpp
+4 −6 cpp/src/request_mem.cpp
+12 −8 cpp/src/request_stream.cpp
+5 −5 cpp/src/request_tag.cpp
+13 −12 cpp/src/request_tag_multi.cpp
+3 −3 cpp/src/utils/file_descriptor.cpp
+6 −3 cpp/src/utils/sockaddr.cpp
+12 −9 cpp/src/worker.cpp
+5 −0 cpp/tests/buffer.cpp
+1 −0 cpp/tests/context.cpp
+17 −12 cpp/tests/request.cpp
+9 −4 cpp/tests/worker.cpp
+31 −27 dependencies.yaml
+4 −4 python/distributed-ucxx/pyproject.toml
+2 −2 python/libucxx/pyproject.toml
+9 −9 python/ucxx/pyproject.toml
+16 −3 python/ucxx/ucxx/_lib/libucxx.pyx
+9 −3 python/ucxx/ucxx/_lib/ucxx_api.pxd
3 changes: 3 additions & 0 deletions README.md
@@ -18,6 +18,9 @@ TensorRT-LLM
<div align="left">

## Latest News
+* [04/10] TensorRT-LLM DeepSeek R1 performance benchmarking best practices now published.
+✨ [➡️ link](./docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md)

* [04/05] TensorRT-LLM can run Llama 4 at over 40,000 tokens per second on B200 GPUs!

![L4_perf](./docs/source/media/l4_launch_perf.png)
Empty file added a.txt
Empty file.
Empty file added b.txt
Empty file.
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
@@ -118,7 +118,7 @@ class CacheTransceiver : public BaseCacheTransceiver
std::unique_ptr<DataRequester> mDataRequester;
std::map<LlmRequest*, std::future<void>> mResponderFutures;
std::vector<std::pair<LlmRequest*, std::future<void>>> mRequesterFutures;
-mpi::MpiComm const *mMpiGroupComm{}, *mMpiWorldComm{};
+mpi::MpiComm const *mMpiGroupComm{nullptr}, *mMpiWorldComm{nullptr};
std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm, mMpiGroupDataComm,
mMpiGroupTPInDPComm;
executor::kv_cache::CommState const* mCommState;
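The change above is cosmetic: for a pointer member, `{}` and `{nullptr}` both value-initialize to null, so spelling out `nullptr` only makes the intent explicit. A self-contained sketch demonstrating the equivalence:

```cpp
#include <cassert>

struct Demo
{
    int const* implicitNull{};        // value-initialized: null pointer
    int const* explicitNull{nullptr}; // identical semantics, explicit intent
};

int main()
{
    Demo d;
    assert(d.implicitNull == nullptr);
    assert(d.explicitNull == nullptr);
    return 0;
}
```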
1 change: 1 addition & 0 deletions cpp/tensorrt_llm/CMakeLists.txt
@@ -375,6 +375,7 @@ set(TRTLLM_LINK_LIBS
trtllm_gen_fmha
trtllm_gen_blockscale_gemm
trtllm_gen_fp8_block_scale_moe
+trtllm_gen_gemm
selective_scan_src
ws_layernorm_src
fpA_intB_gemm_src
9 changes: 5 additions & 4 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.cu
@@ -140,20 +140,21 @@ __global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __res
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
{
int const bid = blockIdx.x; // Index of request in batch
-float const diversityRate{diversityRates[batchSlots[bid]]};
+runtime::SizeType32 const slot = batchSlots[bid];
+float const diversityRate{diversityRates[slot]};
T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;

for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
{
int const iBMIn = i / (nBMOut * 2);
-if (finished[bid * nBMIn + iBMIn].isFinished())
+if (finished[slot * nBMIn + iBMIn].isFinished())
{
-pLocalLogProbs[i] += (i == endIds[bid]) ? 1.0f : 0.0f;
+pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
}
else
{
// nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
-pLocalLogProbs[i] += cumLogProbs[bid * nBM + iBMIn] + diversityRate * iBMIn;
+pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
}
}
return;
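The beam-search fix above swaps the dense batch index `bid` (`blockIdx.x`) for the persistent batch slot `batchSlots[bid]` when indexing per-request state (`finished`, `endIds`, `cumLogProbs`). A host-side C++ sketch of why the two indices diverge; sizes and values are illustrative only:

```cpp
#include <cassert>
#include <vector>

// Host-side sketch of the indexing bug the fix addresses: per-request state
// lives at a persistent "batch slot", while blockIdx.x enumerates the dense,
// possibly reordered batch. Indexing state with the dense index reads another
// request's data whenever the active slots are not exactly 0..nBS-1.
int main()
{
    int const nMaxSlots = 8;
    std::vector<float> cumLogProbs(nMaxSlots, 0.0f);
    cumLogProbs[5] = -1.5f; // state of the request occupying slot 5

    std::vector<int> const batchSlots{5}; // a single active request, in slot 5
    int const bid = 0;                    // dense index, as blockIdx.x in the kernel

    // Wrong (pre-fix): reads slot 0, which belongs to no/another request.
    float const wrong = cumLogProbs[bid];
    // Right (post-fix): resolve the slot first, as the kernel now does.
    int const slot = batchSlots[bid];
    float const right = cumLogProbs[slot];

    assert(wrong == 0.0f && right == -1.5f);
    return 0;
}
```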
3 changes: 2 additions & 1 deletion cpp/tensorrt_llm/kernels/trtllmGenKernels/CMakeLists.txt
@@ -15,6 +15,7 @@
# the License.
#

-add_subdirectory(fmha)
add_subdirectory(blockscaleGemm)
+add_subdirectory(fmha)
add_subdirectory(fp8BlockScaleMoe)
+add_subdirectory(gemm)
@@ -76,6 +76,7 @@ struct TrtllmGenBlockScaleGemmOptions
void TrtllmGenBlockScaleGemmRunner::run(int32_t m, int32_t n, int32_t k, void const* a, float const* aScale,
void const* b, float const* bScale, void* c, float* cScale, CUstream stream)
{

TrtllmGenBlockScaleGemmOptions options;
options.mM = m;
options.mN = n;
@@ -98,10 +99,9 @@ void TrtllmGenBlockScaleGemmRunner::run(int32_t m, int32_t n, int32_t k, void co
options.mSliceK = mKernelInfo->sliceK;

auto params = TrtllmGenBlockScaleGemmKernelParams::setKernelParams(options, a, aScale, b, bScale, c,
-nullptr /* multimemC */, cScale, nullptr /* ptrPartialSumsForSplitK */,
-nullptr /* multimemPartialSumsForSplitK */, nullptr /* ptrTileBars */, nullptr /* multimemTileBars */,
-nullptr /* ptrCompletionBars */, nullptr /* multimemCompletionBars */, nullptr /* ptrSplitKCompletionBars */, 0,
-1);
+nullptr /* ptrSfc */, nullptr /* multimemC */, cScale /* ptrScaleC */, nullptr /* ptrPartialSumsForSplitK */,
+nullptr /* ptrTileBars */, nullptr /* multimemTileBars */, nullptr /* ptrCompletionBars */,
+nullptr /* multimemCompletionBars */, nullptr /* ptrSplitKCompletionBars */, 0, 1);
TLLM_CHECK_WITH_INFO(sizeof(params) == 832, "Size mismatch between trtllm-gen and trtllm");

CUlaunchConfig launch_config;
28 changes: 28 additions & 0 deletions cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/CMakeLists.txt
@@ -0,0 +1,28 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#

file(GLOB_RECURSE SRC_CPP *.cpp)
file(GLOB_RECURSE SRC_CU *.cu)

filter_cuda_archs("100" SRC_CPP)

add_library(trtllm_gen_gemm OBJECT ${SRC_CPP} ${SRC_CU})

target_compile_definitions(trtllm_gen_gemm PUBLIC TLLM_GEN_EXPORT_INTERFACE)

set_property(TARGET trtllm_gen_gemm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET trtllm_gen_gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
105 changes: 105 additions & 0 deletions cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp
@@ -0,0 +1,105 @@
/*
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <vector>

#include "KernelRunner.h"
#include "tensorrt_llm/common/assert.h"
#include "trtllmGen_export/GemmInterface.h"

namespace tensorrt_llm
{
namespace kernels
{

TrtllmGenGemmRunner::TrtllmGenGemmRunner(tg::Dtype eltType, tg::Dtype outputType)
: mEltType(eltType)
, mOutputType(outputType)
{
// Select a GEMM kernel config to use
auto const gemm = gemm::GemmInterface();
auto const configs = gemm.getGemmConfigs();

std::vector<int32_t> selectedIndex;

for (size_t i = 0; i < gemm.getNumGemmConfigs(); ++i)
{
auto const options = configs[i].mOptions;

// When we include low-latency kernels we can set transposeMmaOutput via constructor
if (options.mDtypeElt == eltType && options.mDtypeC == outputType && !options.mTransposeMmaOutput)
{
selectedIndex.push_back(i);
}
}

TLLM_CHECK_WITH_INFO(selectedIndex.size() != 0, "No kernel found for the given output type");
TLLM_CHECK_WITH_INFO(selectedIndex.size() == 1, "Multiple kernels found for the given output type");

mGemmConfig = &configs[selectedIndex[0]];
}

size_t TrtllmGenGemmRunner::getWorkspaceSizeInBytes(
int32_t m, int32_t n, int32_t k, tg::Dtype eltType, tg::Dtype outputType) const
{
gemm::GemmData gemmData;
gemmData.mProblemDimensions.mM = m;
gemmData.mProblemDimensions.mN = n;
gemmData.mProblemDimensions.mK = k;

auto gemm = gemm::GemmInterface();

return gemm.getWorkspaceSizeInBytes(*mGemmConfig, gemmData);
}

void TrtllmGenGemmRunner::run(int32_t m, int32_t n, int32_t k, void const* a, float const* aScale, void const* b,
float const* bScale, void* c, float* cScale, void* workspace, CUstream stream, int device)
{
auto gemm = gemm::GemmInterface();

gemm::GemmData gemmData;

// Dims
gemmData.mProblemDimensions.mM = m;
gemmData.mProblemDimensions.mN = n;
gemmData.mProblemDimensions.mK = k;

// Inputs
gemmData.mInputBuffers.mPtrA = a;
gemmData.mInputBuffers.mPtrSfA = aScale;
gemmData.mInputBuffers.mPtrB = b;
gemmData.mInputBuffers.mPtrSfB = bScale;
gemmData.mInputBuffers.mPtrScaleC = cScale;

// Outputs
gemmData.mOutputBuffers.mPtrC = c;

auto isValidConfig = gemm.isValidConfig(*mGemmConfig, gemmData);
TLLM_CHECK_WITH_INFO(isValidConfig, "Invalid GEMM config selected!");

cudaDeviceProp deviceProperties;
cudaGetDeviceProperties(&deviceProperties, device);

// FIXME once we start using all-reduce in the epilogue of the gemm this can be moved elsewhere
gemm.runInitBeforeWorldSync(*mGemmConfig, gemmData, static_cast<void*>(stream));

auto const err = gemm.run(*mGemmConfig, workspace, gemmData, static_cast<void*>(stream), deviceProperties);

TLLM_CHECK_WITH_INFO(err == 0, "Error occurred when running GEMM!");
}

} // namespace kernels
} // namespace tensorrt_llm
48 changes: 48 additions & 0 deletions cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuda.h>

#include "trtllmGen_export/GemmOptions.h"
#include "trtllmGen_export/trtllm/gen/DtypeDecl.h"

namespace tensorrt_llm
{
namespace kernels
{

namespace tg = trtllm::gen;

class TrtllmGenGemmRunner
{
public:
explicit TrtllmGenGemmRunner(tg::Dtype eltType, tg::Dtype outputType);

[[nodiscard]] size_t getWorkspaceSizeInBytes(
int32_t m, int32_t n, int32_t k, tg::Dtype eltType, tg::Dtype outputType) const;

void run(int32_t m, int32_t n, int32_t k, void const* a, float const* aScale, void const* b, float const* bScale,
void* c, float* cScale, void* workspace, CUstream stream, int device);

private:
tg::Dtype mEltType;
tg::Dtype mOutputType;
gemm::GemmConfig const* mGemmConfig;
};
} // namespace kernels
} // namespace tensorrt_llm
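For orientation, a minimal usage sketch of the runner declared above: construct it for an element/output dtype pair, size and allocate the workspace, then launch. The wrapper function, buffer arguments, and include path are illustrative and not part of the PR; the available `tg::Dtype` enumerators are defined in trtllmGen_export/trtllm/gen/DtypeDecl.h.

```cpp
// Illustrative only: shows the intended call sequence for TrtllmGenGemmRunner.
#include <cstdint>
#include <cuda_runtime.h>

#include "KernelRunner.h" // path within trtllmGenKernels/gemm, as added by this PR

namespace tk = tensorrt_llm::kernels;
namespace tg = trtllm::gen;

// All device pointers are assumed to be preallocated and sized for an MxNxK GEMM.
void runGemmOnce(tg::Dtype eltType, tg::Dtype outputType, int32_t m, int32_t n, int32_t k,
    void const* dA, float const* dAScale, void const* dB, float const* dBScale, void* dC,
    float* dCScale, cudaStream_t stream, int device)
{
    // The constructor asserts that exactly one kernel config matches the dtype pair.
    tk::TrtllmGenGemmRunner runner(eltType, outputType);

    // Size and allocate the kernel workspace before launching.
    size_t const wsBytes = runner.getWorkspaceSizeInBytes(m, n, k, eltType, outputType);
    void* workspace = nullptr;
    cudaMalloc(&workspace, wsBytes);

    runner.run(m, n, k, dA, dAScale, dB, dBScale, dC, dCScale, workspace, stream, device);

    cudaStreamSynchronize(stream);
    cudaFree(workspace);
}
```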