stepfun-ai
diff --git a/‎CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎fserver/csrc/public.hpp‎
Lines changed: 24 additions & 13 deletions b/‎fserver/csrc/public.hpp‎
Lines changed: 24 additions & 13 deletions
diff --git a/‎fserver/csrc/util.h‎
Lines changed: 2 additions & 0 deletions b/‎fserver/csrc/util.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎include/dmlc/logging.h‎
Lines changed: 1 addition & 1 deletion b/‎include/dmlc/logging.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/ps/af_tensor_app.h‎
Lines changed: 13 additions & 7 deletions b/‎include/ps/af_tensor_app.h‎
Lines changed: 13 additions & 7 deletions
diff --git a/‎include/ps/internal/customer.h‎
Lines changed: 2 additions & 1 deletion b/‎include/ps/internal/customer.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎include/ps/internal/message.h‎
Lines changed: 2 additions & 1 deletion b/‎include/ps/internal/message.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎include/ps/internal/threadsafe_queue.h‎
Lines changed: 10 additions & 9 deletions b/‎include/ps/internal/threadsafe_queue.h‎
Lines changed: 10 additions & 9 deletions
diff --git a/‎include/ps/internal/utils.h‎
Lines changed: 17 additions & 4 deletions b/‎include/ps/internal/utils.h‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎include/ps/kv_app.h‎
Lines changed: 4 additions & 1 deletion b/‎include/ps/kv_app.h‎
Lines changed: 4 additions & 1 deletion
@@ -14,6 +14,7 @@ message("MY PYTHON_EXECUTABLE ${Python_EXECUTABLE}")
 message("MY PYTORCH_CMAKE_PREFIX_PATH ${PYTORCH_CMAKE_PREFIX_PATH}")
 
 list(APPEND CMAKE_PREFIX_PATH "${PYTORCH_CMAKE_PREFIX_PATH}/Torch")
+
 find_package(Torch REQUIRED CONFIG)
 message("MY TORCH_INCLUDE_DIRS ${TORCH_INCLUDE_DIRS}")
 message("MY CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}")
@@ -27,7 +28,7 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 if("$ENV{USE_CUDA}" STREQUAL "0")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_ZMQ -DDMLC_USE_RDMA -DSTEPMESH_USE_GDR")
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_ZMQ -DDMLC_USE_CUDA -DSTEPMESH_USE_GDR -DDMLC_USE_RDMA")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_ZMQ -DDMLC_USE_CUDA -DSTEPMESH_USE_GDR -DDMLC_USE_RDMA -DSTEPMESH_ENABLE_TRACE")  # STEPMESH_ENABLE_TRACE is added for this branch only; the USE_CUDA=0 branch keeps its previous flags
 endif()
 
 link_directories("${PROJECT_SOURCE_DIR}/deps/lib")
 
@@ -100,7 +100,7 @@ void respond(std::vector<torch::Tensor>& tensors,
   PS_CHECK_EQ(tensors.size(), reqmeta.pull_tensors.size());
   std::vector<KeyTensor> result;
   for (size_t i = 0; i < tensors.size(); ++i) {
-    result.push_back({reqmeta.pull_tensors[i].key, std::move(tensors[i])});
+    result.push_back({reqmeta.pull_tensors[i].key, std::move(tensors[i].detach())});
   }
   fserver_->Response(reqmeta, result, need_event);
 }
@@ -130,19 +130,19 @@ int push_pull(std::vector<torch::Tensor>& push_tensors,
   auto pull_batch = KeyTensorBatch(pull_tensors.size());
   for (size_t i = 0; i < push_tensors.size(); i++) {
     push_batch[i] = KeyTensor{
-        static_cast<uint64_t>(push_keys[i]), std::move(push_tensors[i])
+        static_cast<uint64_t>(push_keys[i]), std::move(push_tensors[i].detach())
     };
   }
   for (size_t i = 0; i < pull_tensors.size(); i++) {
     pull_batch[i] = KeyTensor{
-        static_cast<uint64_t>(pull_keys[i]), std::move(pull_tensors[i])
+        static_cast<uint64_t>(pull_keys[i]), std::move(pull_tensors[i].detach())
     };
   }
   return fworker_->ZBatchPushPull(push_batch, pull_batch);
 }
 
-void wait(int handler) {
-  fworker_->Wait(handler);
+// Wait for the push_pull request identified by `handler` to complete.
+// `timeout_ms` is forwarded unchanged to AFTensorWorker::Wait. Its default is
+// 10000 so that direct C++ callers get the same value as the Python binding
+// (py::arg("timeout_ms") = 10000) and the Wait/WaitRequest layers underneath.
+void wait(int handler, uint64_t timeout_ms = 10000) {
+  fworker_->Wait(handler, timeout_ms);
 }
 
 void barrier(bool include_server, bool include_worker, bool instrance_barrier=true) {
@@ -163,26 +163,29 @@ void barrier(bool include_server, bool include_worker, bool instrance_barrier=tr
   }
 }
 
+
 void init() {
+
   std::string role_str = ps::GetEnv("DMLC_ROLE", "server");
+  int offset = 0;
   role_ = ps::GetRole(role_str);
 
   ps::Environment::Get()->find("STEPMESH_GPU", &gpu_, gpu_);
   ps::Environment::Get()->find("DMLC_GROUP_SIZE", &group_size_, group_size_);
   ps::Environment::Get()->find("DMLC_NODE_RANK", &node_rank_, node_rank_);
-  ps::Environment::Get()->find("DMLC_INSTANCE_ID", &instance_id_, gpu_);
+  ps::Environment::Get()->find("DMLC_RANK_OFFSET", &offset, offset);
+  ps::Environment::Get()->find("DMLC_INSTANCE_ID", &instance_id_, gpu_ + offset);
   ps::Environment::Get()->find("DMLC_NUM_WORKER", &num_worker_, num_worker_);
-
+  
   worker_mask_ = (1 << num_worker_) - 1;
   q_.resize(num_worker_);
   q_signal_.store(0);;
-
-  ps::StartPS(0, role_,  group_size_ * node_rank_ + gpu_, true);
+  ps::StartPS(0, role_,  group_size_ * node_rank_ + gpu_ + offset, true);
   if (role_ == Node::WORKER) {
-    fworker_ = new AFTensorWorker(instance_id_);
+    fworker_ = new AFTensorWorker(instance_id_ );
     barrier(true, true);
   } else if (role_ == Node::SERVER) {
-    fserver_ = new AFTensorServer(instance_id_);
+    fserver_ = new AFTensorServer(instance_id_ );
     fserver_->SetRequestHandle(RequestHandler);
     ps::RegisterExitCallback([]() { delete fserver_; });
     barrier(true, true);
@@ -242,8 +245,16 @@ void pybind_public(py::module &m){
         py::call_guard<py::gil_scoped_release>());
 
   // APIs for Attention Instances
-  m.def("push_pull", &push_pull, py::call_guard<py::none>());
-  m.def("wait", &wait, py::call_guard<py::none>());
+  m.def("push_pull", &push_pull, 
+    py::arg("push_tensors"),
+    py::arg("push_keys"),
+    py::arg("pull_tensors"),
+    py::arg("pull_keys"),
+    py::call_guard<py::none>());
+  m.def("wait", &wait, 
+    py::arg("handler"),
+    py::arg("timeout_ms") = 10000,
+    py::call_guard<py::none>());
 
   // APIs for FFN Instances
   m.def("get_batch", &get_batch, py::call_guard<py::none>());
 
@@ -19,6 +19,8 @@
 #include "ps/ps.h"
 
 #ifndef UTIL_H_
+typedef std::tuple<uint64_t, std::vector<torch::Tensor>, std::vector<uint64_t>>
+    ServerDataBatch;  // NOTE(review): verbatim duplicate of the typedef that follows #define UTIL_H_, placed between #ifndef and #define — looks accidental, confirm before merging
 #define  UTIL_H_
 typedef std::tuple<uint64_t, std::vector<torch::Tensor>, std::vector<uint64_t>>
     ServerDataBatch;
 
@@ -190,7 +190,7 @@ class LogMessage {
 #endif
   {
     log_stream_ << "[" << pretty_date_.HumanDate() << "] "
-                << getenv("DMLC_ROLE") << " " << file << ":" << line << ": ";
+                << getenv("DMLC_ROLE") << " " << getenv("STEPMESH_GPU") << " " << file << ":" << std::dec << line << ": ";
   }
   ~LogMessage() { log_stream_ << "\n"; }
   std::ostream &stream() { return log_stream_; }
 
@@ -133,6 +133,8 @@ class AFTensorWorker {
     req.event = GetEvent();
     req.event->Record();
 
+    PS_VLOG(3) << "ts" << start_ts << " pushpull_queue_ push "
+               << pushpull_queue_.Size();
     pushpull_queue_.Push(std::move(req));
 
     // std::unique_lock<std::mutex> timestamp_lock(timestamp_mu_);
@@ -144,13 +146,13 @@ class AFTensorWorker {
    * \brief Wait for the operation to complete
    * @param timestamp return by push, pull or push-pull operations
    */
-  void Wait(int timestamp) {
-    kv_.Wait(timestamp);
+  void Wait(int timestamp, uint64_t timeout_ms = 10000) {
+    kv_.Wait(timestamp, timeout_ms);
     // std::unique_lock<std::mutex> lock(timestamp_mu_);
     auto itr = batch_timestamps_.find(timestamp);
     if (itr != batch_timestamps_.end()) {
       for (auto ts : itr->second) {
-        kv_.Wait(ts);
+        kv_.Wait(ts, timeout_ms);
       }
       batch_timestamps_.erase(itr);
     }
@@ -199,15 +201,15 @@ class AFTensorWorker {
   }
 
   void PushPullWorker() {
-    BindCpuCore(4, 1);
+    BindCpuCore(3, 1);
     Backend::Get()->SetDevice(gpu_);
-    while (!pushpull_stop_.load()) {
+    while (true) {
+      PS_VLOG(4) << "pushpull_queue_ Loop wait ";
       AFTensorRequest req;
-      pushpull_queue_.WaitAndPop(&req);
-
       if (pushpull_stop_.load()) {
         break;
       }
+      pushpull_queue_.WaitAndPop(&req, true);
 
       if (req.event != nullptr) {
         req.event->Sync();
@@ -216,6 +218,8 @@ class AFTensorWorker {
       }
       ZBatchPushPull_(req.push, req.push_timestamps, req.pull,
                       req.pull_timestamps);
+      PS_VLOG(4) << "pushpull_queue_ Loop done " << req.push_timestamps[0]
+                 << " " << req.pull_timestamps[0];
     }
     PS_LOG(INFO) << "Stop PushPullWorker" << gpu_;
   }
@@ -233,6 +237,8 @@ class AFTensorWorker {
     msg.meta.timestamp = ts;
     msg.meta.addr = reinterpret_cast<uint64_t>(tensor.data_ptr());
     msg.meta.val_len = tensor.numel() * tensor.itemsize();
+    PS_VLOG(2) << "ZPush_ addr: 0x" << std::hex << msg.meta.addr << std::dec
+               << " val_len: " << msg.meta.val_len;
     msg.meta.key = keys[0];
     msg.meta.is_tensor = 1;
     msg.meta.dtype = static_cast<int>(tensor.scalar_type());
 
@@ -32,6 +32,7 @@ struct CustomerTracker {
   std::atomic<int> response_count;
   struct Trace request;
   struct Trace response;
+  uint64_t start_time;
 };
 
 class Customer {
@@ -80,7 +81,7 @@ class Customer {
    * \brief wait until the request is finished. threadsafe
    * \param timestamp the timestamp of the request
    */
-  void WaitRequest(int timestamp);
+  void WaitRequest(int timestamp, uint64_t timeout_ms = 10000);
 
   /**
    * \brief return the number of responses received for the request. threadsafe
 
@@ -293,7 +293,7 @@ struct Meta {
       ss << " }";
     }
     if (!control.empty() || simple_app) ss << ". NOT DATA MSG!";
-    ss << "Slave QP Count: " << slave_qp_num;
+    ss << ", Slave QP Count: " << slave_qp_num;
     return ss.str();
   }
   /** \brief an int head */
@@ -384,6 +384,7 @@ struct Message {
       meta.dst_dev_id = val.dst_device_id_;
     }
   }
+
   std::string DebugString() const {
     std::stringstream ss;
     ss << meta.DebugString();
 
@@ -10,6 +10,7 @@
 #include <queue>
 #include <utility>
 
+#include "dmlc/logging.h"
 #include "ps/base.h"
 #include "ps/internal/env.h"
 #include "ps/internal/spsc_queue.h"
@@ -34,10 +35,10 @@ class ThreadsafeQueue {
    * \brief push an value into the end. threadsafe.
    * \param new_value the value
    */
-  inline void Push(T new_value) {
+  inline void Push(T new_value, bool print_log = false) {
     if (lockless_) {
       // PushLockless(std::move(new_value));
-      PushAtomic(std::move(new_value));
+      PushAtomic(std::move(new_value), print_log);
       return;
     }
     {
@@ -51,10 +52,10 @@ class ThreadsafeQueue {
    * \brief wait until pop an element from the beginning, threadsafe
    * \param value the poped value
    */
-  inline void WaitAndPop(T* value) {
+  inline void WaitAndPop(T* value, bool print_log = false) {
     if (lockless_) {
       // WaitAndPopLockless(value);
-      WaitAndPopAtomic(value);
+      WaitAndPopAtomic(value, print_log);
       return;
     }
     std::unique_lock<std::mutex> lk(mu_);
@@ -96,7 +97,7 @@ class ThreadsafeQueue {
     }
   }
 
-  void PushAtomic(T new_value) {
+  void PushAtomic(T new_value, bool print_log = false) {
     const size_t current_tail = tail_.load(std::memory_order_relaxed);
     const size_t next_tail = (current_tail + 1) % capacity_;
     while (next_tail == head_.load(std::memory_order_acquire)) {
@@ -112,19 +113,19 @@ class ThreadsafeQueue {
     return;
   }
 
-  void WaitAndPopAtomic(T* value) {
-    const size_t current_head = head_.load(std::memory_order_relaxed);
-
+  void WaitAndPopAtomic(T* value, bool print_log = false) {
+    size_t current_head = head_.load(std::memory_order_relaxed);
     // Check if the queue is empty
     // acquire: ensures writes preceding this load in other threads are
     // visible. Specifically, ensures the producer's writes to 'tail_' are
     // visible.
-    int max_count = 1000;
+    int max_count = 5000;
     int count = 0;
     while (current_head == tail_.load(std::memory_order_acquire)) {
       // Queue is empty, spin and yield
       count++;
       if (count > max_count) {
+        current_head = head_.load(std::memory_order_relaxed);
         count = 0;
         // _mm_pause();
       }
 
@@ -6,6 +6,7 @@
 #define PS_INTERNAL_UTILS_H_
 
 #include <ctype.h>
+#include <execinfo.h>
 #include <pthread.h>
 #include <sched.h>
 #include <stdio.h>
@@ -109,25 +110,37 @@ static uint64_t norm = CycleToNs();
 /*!
  * \brief Get the current nanocount.
  */
-static inline uint64_t GetNanosecond() {
+static inline uint64_t GetNanosecond(bool return_zero = true) {
 #ifdef STEPMESH_ENABLE_TRACE
+  return_zero = false;  // tracing compiled in: the caller's argument is ignored and a timestamp is always computed
+#endif
+  if (return_zero) {  // only reachable as true when STEPMESH_ENABLE_TRACE is not defined
+    return 0;  // caller asked (or defaulted) to skip the timestamp computation
+  }
   if (norm == 0) {
     norm = CycleToNs();
   }
   return static_cast<uint64_t>((_GetCurrentCycle() << 5) / norm);
-#else
-  return 0;
-#endif
 }
 
 static int PS_VERBOSE = ps::GetEnv("PS_VERBOSE", 0);
 
+/**
+ * \brief Rename the calling thread (best effort).
+ *
+ * On Linux a thread name is limited to 16 bytes including the terminating
+ * NUL; pthread_setname_np() fails with ERANGE and leaves the name untouched
+ * when given anything longer. The requested name is therefore truncated to
+ * 15 characters so that long names are still applied (shortened) instead of
+ * being silently dropped.
+ *
+ * \param name the desired thread name; only the first 15 characters are used
+ */
+static inline void RenameThread(const std::string &name) {
+  constexpr size_t kMaxThreadNameLen = 15;
+  const std::string truncated = name.substr(0, kMaxThreadNameLen);
+  // The return value is deliberately ignored: renaming is purely cosmetic.
+  pthread_setname_np(pthread_self(), truncated.c_str());
+}
+
 /**
  * \brief Bind current thread to a specific CPU core.
  * \param offset is the start of the core id
  * \param core_count is the number of cores the thread need.
  */
 static inline void BindCpuCore(int offset, int core_count = 1) {
+  RenameThread("StepMesh: BindCpuCore");
   int gpu = -1;
   Environment::Get()->find("STEPMESH_GPU", &gpu, gpu);
   int bind_enable = 0;
 
@@ -90,6 +90,7 @@ class KVWorker : public SimpleApp {
    */
   explicit KVWorker(int app_id, int customer_id, int instance_idx = 0)
       : SimpleApp() {
+    printf("KVWorker instance_idx,%d\n", instance_idx);
     postoffice_ = Postoffice::GetWorker(instance_idx);
     PS_VLOG(3) << "KVWorker " << instance_idx << " po@"
                << reinterpret_cast<uint64_t>(postoffice_);
@@ -207,7 +208,9 @@ class KVWorker : public SimpleApp {
    *
    * \param timestamp the timestamp returned by the push or pull
    */
-  void Wait(int timestamp) { obj_->WaitRequest(timestamp); }
+  void Wait(int timestamp, uint64_t timeout_ms = 10000) {  // timeout_ms: presumably the maximum wait in milliseconds — confirm against WaitRequest
+    obj_->WaitRequest(timestamp, timeout_ms);  // both arguments are passed through unchanged
+  }
 
   /**
    * \brief zero-copy Push
Original file line number	Diff line number	Diff line change
`@@ -190,7 +190,7 @@ class LogMessage {`
`190`	`190`	`#endif`
`191`	`191`	`{`
`192`	`192`	`log_stream_ << "[" << pretty_date_.HumanDate() << "] "`
`193`		`- << getenv("DMLC_ROLE") << " " << file << ":" << line << ": ";`
	`193`	`+ << getenv("DMLC_ROLE") << " " << getenv("STEPMESH_GPU") << " " << file << ":" << std::dec << line << ": ";`
`194`	`194`	`}`
`195`	`195`	`~LogMessage() { log_stream_ << "\n"; }`
`196`	`196`	`std::ostream &stream() { return log_stream_; }`
Original file line number	Diff line number	Diff line change
`@@ -293,7 +293,7 @@ struct Meta {`
`293`	`293`	`ss << " }";`
`294`	`294`	`}`
`295`	`295`	`if (!control.empty() \|\| simple_app) ss << ". NOT DATA MSG!";`
`296`		`- ss << "Slave QP Count: " << slave_qp_num;`
	`296`	`+ ss << ", Slave QP Count: " << slave_qp_num;`
`297`	`297`	`return ss.str();`
`298`	`298`	`}`
`299`	`299`	`/** \brief an int head */`
`@@ -384,6 +384,7 @@ struct Message {`
`384`	`384`	`meta.dst_dev_id = val.dst_device_id_;`
`385`	`385`	`}`
`386`	`386`	`}`
	`387`	`+`
`387`	`388`	`std::string DebugString() const {`
`388`	`389`	`std::stringstream ss;`
`389`	`390`	`ss << meta.DebugString();`