diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 478ea75472717..bb34d9c1873ac 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -5316,7 +5316,13 @@ def _accumulate_gradients_with_fuse(self, main_block, fp16, fused_size):
                 "copy_data": False,
                 "use_align": True,
                 "dtype": grads[0].dtype,
-                self._op_role_key: self._op_role.Backward
+                self._op_role_key: self._op_role.Backward,
+                # On NPU, the nan/inf check logic differs from that on GPU.
+                # If the fused var contains uninitialized sections and the
+                # values in those sections happen to be nan/inf, the check is
+                # triggered spuriously. To avoid this, set_constant is needed on NPU.
+                "set_constant": core.is_compiled_with_npu(),
+                "constant": float(0.0),
             })
         offset += 1
         # For the gradient_merged_fused_var, given a init value during the coalesce op
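As background for the change, here is a minimal sketch (plain NumPy, not Paddle's implementation; the buffer layout, padding rule, and the `nan_inf_check` helper are illustrative assumptions) of why alignment padding in a fused buffer can trip a whole-buffer nan/inf check, and how zero-filling the buffer up front, which is what `set_constant` with `constant=0.0` requests from the `coalesce_tensor` op, avoids the false positive:

```python
import numpy as np

def fused_buffer(grad_sizes, align=4, zero_fill=False):
    """Emulate coalescing several grads into one fused buffer.

    Each grad is padded up to a multiple of `align` elements. The padding
    gaps are never written, so they keep whatever the memory held before;
    here we simulate that "uninitialized" garbage with nan.
    """
    padded = [((s + align - 1) // align) * align for s in grad_sizes]
    buf = np.full(sum(padded), np.nan, dtype=np.float32)
    if zero_fill:
        # The behavior requested by set_constant=True, constant=0.0.
        buf[:] = 0.0
    offset = 0
    for size, padded_size in zip(grad_sizes, padded):
        buf[offset:offset + size] = 1.0  # real gradient values land here
        offset += padded_size            # padding gap is left untouched
    return buf

def nan_inf_check(buf):
    # A whole-buffer finiteness check, as a device runtime might apply
    # to the fused var (illustrative stand-in for the NPU-side check).
    return bool(np.isfinite(buf).all())

print(nan_inf_check(fused_buffer([3, 5])))                  # False: padding trips the check
print(nan_inf_check(fused_buffer([3, 5], zero_fill=True)))  # True: zero fill avoids it
```

In the diff, `set_constant` is gated on `core.is_compiled_with_npu()`, so the extra zero-fill only runs on the NPU path where the stricter check applies.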