@@ -73,6 +73,7 @@ class ThreadPoolTempl {
       allow_spinning_(allow_spinning),
       global_steal_partition_(EncodePartition(0, num_threads_)),
       blocked_(0),
+      num_tasks_(0),
       spinning_(0),
       done_(false),
       cancelled_(false),
@@ -143,6 +144,7 @@ class ThreadPoolTempl {
   void AddTaskWithHint(std::function<void()> fn, int start, int limit) {
     Task t = env_.CreateTask(std::move(fn));
     PerThread* pt = GetPerThread();
+    uint64_t num_tasks = num_tasks_.fetch_add(1, std::memory_order_relaxed) + 1;
     if (pt->pool == this) {
       // Worker thread of this pool, push onto the thread's queue.
       Queue& q = thread_data_[pt->thread_id].queue;
@@ -166,8 +168,11 @@ class ThreadPoolTempl {
       // this. We expect that such scenario is prevented by program, that is,
       // this is kept alive while any threads can potentially be in Schedule.
       if (!t.f) {
-        ec_.Notify(false);
+        if (num_tasks > num_threads_ - blocked_.load(std::memory_order_relaxed)) {
+          ec_.Notify(false);
+        }
       } else {
+        num_tasks_.fetch_sub(1, std::memory_order_relaxed);
         env_.ExecuteTask(t);  // Push failed, execute directly.
       }
     }
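Note on the hunk above: ec_.Notify(false) can wake a blocked worker, which is comparatively expensive, so the patch skips the notification whenever the pending-task count does not exceed the number of workers already awake (num_threads_ - blocked_). A minimal standalone sketch of that gating heuristic, using std::condition_variable in place of the pool's EventCount; WakeGate, OnTaskAdded, and OnTaskDone are illustrative names, not part of the actual code:

#include <atomic>
#include <condition_variable>
#include <cstdint>

// Sketch only: the real pool signals through an EventCount, not a
// condition variable, and none of these names exist in the patch.
class WakeGate {
 public:
  explicit WakeGate(unsigned num_threads) : num_threads_(num_threads) {}

  // Producer side: bookkeeping after a task is enqueued.
  void OnTaskAdded() {
    uint64_t pending = num_tasks_.fetch_add(1, std::memory_order_relaxed) + 1;
    unsigned awake = num_threads_ - blocked_.load(std::memory_order_relaxed);
    // Wake a sleeper only when the awake workers cannot absorb the backlog;
    // otherwise an already-running worker will steal the task anyway.
    if (pending > awake) {
      cv_.notify_one();
    }
  }

  // Worker side: bookkeeping after a task finishes executing.
  void OnTaskDone() { num_tasks_.fetch_sub(1, std::memory_order_relaxed); }

 private:
  const unsigned num_threads_;
  std::atomic<uint64_t> num_tasks_{0};
  std::atomic<unsigned> blocked_{0};
  std::condition_variable cv_;
};

Both accesses are relaxed because the count only steers a heuristic: a stale read costs at most one extra or one deferred wakeup, while task handoff itself is still synchronized by the queues and the event count.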
@@ -263,6 +268,7 @@ class ThreadPoolTempl {
   std::vector<std::vector<unsigned>> all_coprimes_;
   unsigned global_steal_partition_;
   std::atomic<unsigned> blocked_;
+  std::atomic<uint64_t> num_tasks_;
   std::atomic<bool> spinning_;
   std::atomic<bool> done_;
   std::atomic<bool> cancelled_;
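The new member sits next to blocked_ because the two are always read together in the producer's wake test above. One subtlety for anyone reusing the pattern: the increment happens on the submitting thread and the matching decrement on whichever worker executes the task, so the pairing cannot be expressed as a simple scope guard. A hypothetical debug wrapper, not in the patch, that would make an imbalance visible:

#include <atomic>
#include <cassert>
#include <cstdint>

// Hypothetical helper: tracks submissions and completions separately so a
// leaked pending count trips the assert at shutdown.
struct PendingCounter {
  std::atomic<uint64_t> added{0};
  std::atomic<uint64_t> done{0};
  void OnAdd() { added.fetch_add(1, std::memory_order_relaxed); }
  void OnDone() { done.fetch_add(1, std::memory_order_relaxed); }
  uint64_t Pending() const {
    // Relaxed snapshot: good enough for a wake heuristic, not a barrier.
    return added.load(std::memory_order_relaxed) -
           done.load(std::memory_order_relaxed);
  }
  ~PendingCounter() {
    assert(added.load() == done.load() && "leaked pending task");
  }
};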
@@ -305,6 +311,7 @@ class ThreadPoolTempl {
       }
       if (t.f) {
         env_.ExecuteTask(t);
+        num_tasks_.fetch_sub(1, std::memory_order_relaxed);
       }
     }
   } else {
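Placement detail in this hunk: the decrement comes after env_.ExecuteTask(t) returns, so a task still counts as pending while it runs. Producers therefore see in-flight work in num_tasks_, and the wake test stays conservative, erring toward waking threads while the pool is busy. Sketched with the patch's own fields:

if (t.f) {
  env_.ExecuteTask(t);                                 // run the task first
  num_tasks_.fetch_sub(1, std::memory_order_relaxed);  // then drop the count
}
// Decrementing before ExecuteTask would make a fully occupied pool look
// idle to AddTaskWithHint and could suppress wakeups exactly when all
// workers are busy.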
@@ -315,16 +322,14 @@ class ThreadPoolTempl {
       if (!t.f) {
         t = GlobalSteal();
         if (!t.f) {
-          // Leave one thread spinning. This reduces latency.
-          if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+          if (allow_spinning_) {
             for (int i = 0; i < spin_count && !t.f; i++) {
               if (!cancelled_.load(std::memory_order_relaxed)) {
                 t = GlobalSteal();
               } else {
                 return;
               }
             }
-            spinning_ = false;
           }
           if (!t.f) {
             if (!WaitForWork(waiter, &t)) {
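The dropped condition in the hunk above, !spinning_ && !spinning_.exchange(true), allowed at most one spinner pool-wide: a thread claimed the slot with the exchange and released it with spinning_ = false after the loop. Removing it lets every idle worker run spin_count steal attempts before blocking, which burns some idle CPU but cuts wake-up latency under bursty submission. Condensed side by side, with SpinForWork as a hypothetical wrapper around the steal loop:

// Before: at most one spinner, gated by an atomic flag.
if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
  SpinForWork(t);      // hypothetical: the for-loop of GlobalSteal() retries
  spinning_ = false;   // release the single-spinner slot
}

// After: every idle worker spins before falling back to WaitForWork.
if (allow_spinning_) {
  SpinForWork(t);
}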
@@ -336,6 +341,7 @@ class ThreadPoolTempl {
       }
       if (t.f) {
         env_.ExecuteTask(t);
+        num_tasks_.fetch_sub(1, std::memory_order_relaxed);
       }
     }
   }