
Commit 9653144

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into support_logical_operants
2 parents: 9f95f79 + 2e6e188

File tree: 192 files changed, +4871 -1217 lines


AUTHORS.md (+1)

@@ -31,6 +31,7 @@ This is an incomplete list of authors of [Paddle](https://github.com/PaddlePaddl
 | helinwang | He-Lin Wang |
 | heliqi | Li-Qi He |
 | houj04 | HOU Jue |
+| HulekJakub | Jakub Hulek |
 | jacquesqiao | Long-Fei Qiao |
 | [jakpiase](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jakub Piasecki |
 | [jczaja](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jacek Czaja |

cmake/external/xpu.cmake (+1 -1)

@@ -7,7 +7,7 @@ set(XPU_PROJECT "extern_xpu")
 set(XPU_API_LIB_NAME "libxpuapi.so")
 set(XPU_RT_LIB_NAME "libxpurt.so")
 
-set(XPU_BASE_DATE "20230220")
+set(XPU_BASE_DATE "20230227")
 set(XPU_XCCL_BASE_VERSION "1.0.10")
 
 if(NOT DEFINED XPU_BASE_URL)

paddle/fluid/framework/details/nan_inf_utils_detail.cu (+27 -6)

@@ -174,15 +174,19 @@ __device__ T BlockReduce(T value) {
 
 __device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan,
                                              const int64_t num_inf,
+                                             const int64_t num_zero,
                                              int64_t offset,
                                              int64_t* num_nan_ptr,
-                                             int64_t* num_inf_ptr) {
+                                             int64_t* num_inf_ptr,
+                                             int64_t* num_zero_ptr) {
   int64_t block_num_nan = BlockReduce<int64_t, 2>(num_nan);
   int64_t block_num_inf = BlockReduce<int64_t, 2>(num_inf);
+  int64_t block_num_zero = BlockReduce<int64_t, 2>(num_zero);
 
   if (threadIdx.x == 0) {
     num_nan_ptr[offset] = block_num_nan;
     num_inf_ptr[offset] = block_num_inf;
+    num_zero_ptr[offset] = block_num_zero;
   }
 }
 
@@ -233,13 +237,15 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
                                          const int64_t numel,
                                          int64_t* block_num_nan_ptr,
                                          int64_t* block_num_inf_ptr,
+                                         int64_t* block_num_zero_ptr,
                                          MT* tensor_block_max_ptr,
                                          MT* tensor_block_min_ptr,
                                          MT* tensor_block_mean_ptr) {
   int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
 
   int64_t num_nan = 0;
   int64_t num_inf = 0;
+  int64_t num_zero = 0;
 
   MT max_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
   MT min_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
@@ -256,10 +262,18 @@
     } else if (isinf(value)) {
       num_inf += 1;
     }
+    if (value == static_cast<MT>(0)) {
+      num_zero += 1;
+    }
   }
 
-  BlockReduceNumNanInfAndWrite(
-      num_nan, num_inf, blockIdx.x, block_num_nan_ptr, block_num_inf_ptr);
+  BlockReduceNumNanInfAndWrite(num_nan,
+                               num_inf,
+                               num_zero,
+                               blockIdx.x,
+                               block_num_nan_ptr,
+                               block_num_inf_ptr,
+                               block_num_zero_ptr);
 
   BlockReduceMaxMinAndWrite<MT>(max_value,
                                 min_value,
@@ -273,6 +287,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
 template <typename T, typename MT>
 __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
                                          const int64_t* block_num_inf_ptr,
+                                         const int64_t* block_num_zero_ptr,
                                          const MT* tensor_block_max_ptr,
                                          const MT* tensor_block_min_ptr,
                                          const MT* tensor_block_mean_ptr,
@@ -283,11 +298,13 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
   if (blockIdx.x == 0 && threadIdx.x == 0) {
     int64_t num_nan = 0;
     int64_t num_inf = 0;
+    int64_t num_zero = 0;
 
     // numel_max_min <= 128
    for (int64_t i = 0; i < numel_max_min; ++i) {
       num_nan += block_num_nan_ptr[i];
       num_inf += block_num_inf_ptr[i];
+      num_zero += block_num_zero_ptr[i];
     }
 
     MT max_value = static_cast<MT>(0);
@@ -314,6 +331,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
                                            numel,
                                            num_nan,
                                            num_inf,
+                                           num_zero,
                                            max_value,
                                            min_value,
                                            mean_value,
@@ -451,11 +469,12 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
 
   int64_t numel_max_min = blocks;
 
-  phi::DenseTensor block_num_nan_inf;
-  block_num_nan_inf.Resize({static_cast<int64_t>(2 * numel_max_min)});
+  phi::DenseTensor block_num_nan_inf_zero;
+  block_num_nan_inf_zero.Resize({static_cast<int64_t>(3 * numel_max_min)});
   int64_t* block_num_nan_ptr =
-      dev_ctx->template Alloc<int64_t>(&block_num_nan_inf);
+      dev_ctx->template Alloc<int64_t>(&block_num_nan_inf_zero);
   int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min;
+  int64_t* block_num_zero_ptr = block_num_inf_ptr + numel_max_min;
 
   phi::DenseTensor tensor_block_max_min;
   tensor_block_max_min.Resize({static_cast<int64_t>(3 * numel_max_min)});
@@ -468,6 +487,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
       tensor.numel(),
       block_num_nan_ptr,
       block_num_inf_ptr,
+      block_num_zero_ptr,
       tensor_block_max_ptr,
       tensor_block_min_ptr,
       tensor_block_mean_ptr);
@@ -476,6 +496,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
   FindGlobalMaxMinAndPrint<T, MT>
       <<<1, 1, 0, dev_ctx->stream()>>>(block_num_nan_ptr,
                                        block_num_inf_ptr,
+                                       block_num_zero_ptr,
                                        tensor_block_max_ptr,
                                        tensor_block_min_ptr,
                                        tensor_block_mean_ptr,
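
The GPU change keeps the file's existing two-stage pattern: each thread tallies its own elements, each block reduces its tallies and writes one partial count per blockIdx.x, and the single-thread FindGlobalMaxMinAndPrint kernel sums the partials. The three per-block counters share one int64 allocation of size 3 * numel_max_min, split by pointer arithmetic. Below is a minimal standalone sketch of the per-block counting stage, assuming CUB's cub::BlockReduce in place of Paddle's in-house BlockReduce helper; the kernel name CountZerosPerBlock is hypothetical.

#include <cstdint>
#include <cub/cub.cuh>

// Hedged sketch, not Paddle's code: counts zeros with the same
// tally-then-block-reduce shape as FindNanInfAndBlockMaxMin above.
template <int kBlockDim>
__global__ void CountZerosPerBlock(const float* data,
                                   int64_t numel,
                                   int64_t* block_num_zero_ptr) {
  using BlockReduce = cub::BlockReduce<int64_t, kBlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  // Each thread tallies its own slice of the tensor (grid-stride loop).
  int64_t num_zero = 0;
  for (int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
       i < numel;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    if (data[i] == 0.0f) {
      num_zero += 1;
    }
  }

  // Reduce the per-thread tallies within the block; only thread 0 holds
  // the valid aggregate and writes this block's partial count.
  int64_t block_num_zero = BlockReduce(temp_storage).Sum(num_zero);
  if (threadIdx.x == 0) {
    block_num_zero_ptr[blockIdx.x] = block_num_zero;
  }
}

A launch such as CountZerosPerBlock<256><<<blocks, 256, 0, stream>>>(data, numel, partials), followed by a one-thread summation kernel in the role FindGlobalMaxMinAndPrint plays here, reproduces the flow above; the final serial sum stays cheap because the number of partials is capped (the diff notes numel_max_min <= 128).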

paddle/fluid/framework/details/nan_inf_utils_detail.h (+29 -13)

@@ -69,33 +69,39 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info,
                                        int64_t numel,
                                        int64_t num_nan,
                                        int64_t num_inf,
+                                       int64_t num_zero,
                                        MT max_value,
                                        MT min_value,
                                        MT mean_value,
                                        int check_nan_inf_level) {
   if (num_nan > 0 || num_inf > 0) {
     printf(
         "[PRECISION] [ERROR] in %s, numel=%lld, num_nan=%lld, "
-        "num_inf=%lld, max=%e, min=%e, mean=%e\n",
+        "num_inf=%lld, num_zero=%lld, max=%e, min=%e, mean=%e\n",
         debug_info,
-        static_cast<long long>(numel),    // NOLINT
-        static_cast<long long>(num_nan),  // NOLINT
-        static_cast<long long>(num_inf),  // NOLINT
+        static_cast<long long>(numel),     // NOLINT
+        static_cast<long long>(num_nan),   // NOLINT
+        static_cast<long long>(num_inf),   // NOLINT
+        static_cast<long long>(num_zero),  // NOLINT
         static_cast<float>(max_value),
         static_cast<float>(min_value),
         static_cast<float>(mean_value));
     if (check_nan_inf_level == 0) {
 #if defined(__NVCC__) || defined(__HIPCC__)
       PADDLE_ENFORCE(false,
-                     "There are NAN or INF (num_nan=%ld, num_inf=%lld) in %s.",
-                     static_cast<long long>(num_nan),  // NOLINT
-                     static_cast<long long>(num_inf),  // NOLINT
+                     "There are NAN or INF (num_nan=%ld, num_inf=%lld, "
+                     "num_zero=%lld) in %s.",
+                     static_cast<long long>(num_nan),   // NOLINT
+                     static_cast<long long>(num_inf),   // NOLINT
+                     static_cast<long long>(num_zero),  // NOLINT
                      debug_info);
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "There are NAN or INF (num_nan=%lld, num_inf=%lld) in %s.",
-          static_cast<long long>(num_nan),  // NOLINT
-          static_cast<long long>(num_inf),  // NOLINT
+          "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in "
+          "%s.",
+          static_cast<long long>(num_nan),   // NOLINT
+          static_cast<long long>(num_inf),   // NOLINT
+          static_cast<long long>(num_zero),  // NOLINT
           debug_info));
 #endif
     }
@@ -114,6 +120,7 @@ void PrintForDifferentLevelFile(const char* debug_info,
                                 int64_t numel,
                                 int64_t num_nan,
                                 int64_t num_inf,
+                                int64_t num_zero,
                                 MT max_value,
                                 MT min_value,
                                 MT mean_value,
@@ -136,9 +143,10 @@
 
   if (num_nan > 0 || num_inf > 0) {
     outfile << "[PRECISION] [ERROR] in " << debug_info
-            << ", numel=" << static_cast<long long>(numel)      // NOLINT
-            << ", num_nan=" << static_cast<long long>(num_nan)  // NOLINT
-            << ", num_inf=" << static_cast<long long>(num_inf)  // NOLINT
+            << ", numel=" << static_cast<long long>(numel)        // NOLINT
+            << ", num_nan=" << static_cast<long long>(num_nan)    // NOLINT
+            << ", num_inf=" << static_cast<long long>(num_inf)    // NOLINT
+            << ", num_zero=" << static_cast<long long>(num_zero)  // NOLINT
            << ", max=" << static_cast<float>(max_value)
            << ", min=" << static_cast<float>(min_value)
            << ", mean=" << static_cast<float>(mean_value) << std::endl;
@@ -200,6 +208,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
 
   std::vector<int64_t> thread_num_nan(num_threads, 0);
   std::vector<int64_t> thread_num_inf(num_threads, 0);
+  std::vector<int64_t> thread_num_zero(num_threads, 0);
   std::vector<MT> thread_min_value(num_threads, static_cast<MT>(value_ptr[0]));
   std::vector<MT> thread_max_value(num_threads, static_cast<MT>(value_ptr[0]));
   std::vector<MT> thread_mean_value(num_threads, static_cast<MT>(0));
@@ -230,17 +239,22 @@
       } else if (std::isinf(value)) {
         thread_num_inf[tid] += 1;
       }
+      if (value == 0) {
+        thread_num_zero[tid] += 1;
+      }
     }
   }
 
   int64_t num_nan = 0;
   int64_t num_inf = 0;
+  int64_t num_zero = 0;
   MT min_value = thread_min_value[0];
   MT max_value = thread_max_value[0];
   MT mean_value = static_cast<MT>(0);
   for (int i = 0; i < num_threads; ++i) {
     num_nan += thread_num_nan[i];
     num_inf += thread_num_inf[i];
+    num_zero += thread_num_zero[i];
     min_value = std::min(thread_min_value[i], min_value);
     max_value = std::max(thread_max_value[i], max_value);
     mean_value += thread_mean_value[i];
@@ -254,6 +268,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
                             numel,
                             num_nan,
                             num_inf,
+                            num_zero,
                             max_value,
                             min_value,
                             mean_value,
@@ -266,6 +281,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
                               numel,
                               num_nan,
                               num_inf,
+                              num_zero,
                               max_value,
                               min_value,
                               mean_value,
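
The CPU path applies the same tally-then-merge idea with one counter slot per thread, so the scan loop needs no atomics or locks, and the merge over num_threads slots runs serially afterwards. A minimal sketch of that scheme, assuming OpenMP supplies the thread pool and thread ids (the real CheckNanInfCpuImpl's threading setup may differ) and using the hypothetical name CountNanInfZeroCpu:

#include <cmath>
#include <cstdint>
#include <vector>
#include <omp.h>

// Hedged sketch, not Paddle's code: per-thread tally slots followed by a
// serial merge, mirroring CheckNanInfCpuImpl's counting scheme above.
void CountNanInfZeroCpu(const float* data,
                        int64_t numel,
                        int64_t* num_nan_out,
                        int64_t* num_inf_out,
                        int64_t* num_zero_out) {
  const int num_threads = omp_get_max_threads();
  std::vector<int64_t> thread_num_nan(num_threads, 0);
  std::vector<int64_t> thread_num_inf(num_threads, 0);
  std::vector<int64_t> thread_num_zero(num_threads, 0);

#pragma omp parallel
  {
    const int tid = omp_get_thread_num();
#pragma omp for
    for (int64_t i = 0; i < numel; ++i) {
      const float value = data[i];
      if (std::isnan(value)) {
        thread_num_nan[tid] += 1;
      } else if (std::isinf(value)) {
        thread_num_inf[tid] += 1;
      }
      // Independent check, as in the diff: the zero count sits outside the
      // NaN/Inf else-if chain.
      if (value == 0.0f) {
        thread_num_zero[tid] += 1;
      }
    }
  }

  // Serial merge over num_threads slots; cheap compared to the scan.
  int64_t num_nan = 0, num_inf = 0, num_zero = 0;
  for (int i = 0; i < num_threads; ++i) {
    num_nan += thread_num_nan[i];
    num_inf += thread_num_inf[i];
    num_zero += thread_num_zero[i];
  }
  *num_nan_out = num_nan;
  *num_inf_out = num_inf;
  *num_zero_out = num_zero;
}

Note that, as in the diff's error messages, a tensor with zeros but no NaN/Inf triggers no report: num_zero is extra diagnostic context printed only when num_nan > 0 or num_inf > 0.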

paddle/fluid/framework/ir/CMakeLists.txt (+1 -1)

@@ -215,7 +215,7 @@ if(WITH_XPU)
   cc_library(
     xpu_quant_utils
     SRCS xpu/quant_utils.cc
-    DEPS pass)
+    DEPS pass phi)
   cc_library(
     xpu_pass_utils
     SRCS xpu/pass_utils.cc
