diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index d1f627070f4e9a..6cd9a1347f6322 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -1673,6 +1673,22 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } } #endif +#if defined(PADDLE_WITH_XPU) + if (platform::is_xpu_place(place)) { + if (!GetPrivate()->IsStreamSafeCUDAAllocatorUsed()) { + return Alloc(place, size); + } + platform::XPUPlace p(place); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + XPUStream s = reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); + } + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed() && @@ -1690,8 +1706,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } else { return m->GetAllocator(p, size)->Allocate(size); } -#elif defined(PADDLE_WITH_XPU) - return GetAllocator(place)->Allocate(size); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with GPU or XPU or CustomDevice.")); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c40da19d47e5f..7dae4436854ef4 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -41,7 +41,7 @@ std::shared_ptr AllocShared(const platform::Place& place, place, size, stream); } -AllocationPtr Alloc(const platform::CUDAPlace& place, +AllocationPtr Alloc(const platform::Place& place, size_t size, const phi::Stream& stream) { return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index dc25b85c8b0402..4cd642aedcdf7a 100644 --- 
a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -40,7 +40,7 @@ extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, const phi::Stream& stream); -extern AllocationPtr Alloc(const platform::CUDAPlace& place, +extern AllocationPtr Alloc(const platform::Place& place, size_t size, const phi::Stream& stream); diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index f1240bcb348f00..72f448a4a9aff0 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -20,6 +20,7 @@ #include "paddle/common/exception.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/os_info.h" #include "xpu/runtime.h" @@ -130,6 +131,39 @@ struct XPUContext::Impl { } } + class XHPCBufferManager { + public: + void* Alloc(const Place& place, size_t size, XPUStream xpu_stream) { + VLOG(3) << "Alloc " << size << " bytes from XHPC on stream " + << xpu_stream; + phi::Stream stream(reinterpret_cast(xpu_stream)); + auto allocation = memory_utils::Alloc(place, size, stream); + void* ret = allocation.get()->ptr(); + allocations_to_free_.back().push_back(std::move(allocation)); + return ret; + } + + void Save() { + allocations_to_free_.emplace_back(); + VLOG(3) << "XHPC ctx_guard created, " << GetStackLevel() + << " are in use now."; + } + + void Free() { + PADDLE_ENFORCE_GT(GetStackLevel(), + 0, + errors::PreconditionNotMet( + "No ctx_guard when overload_free is called")); + allocations_to_free_.pop_back(); + VLOG(3) << "XHPC ctx_guard destroyed, " << GetStackLevel() + << " are in use now."; + } + + private: + size_t GetStackLevel() const { return allocations_to_free_.size(); } + std::vector> allocations_to_free_; + }; + void Init(int64_t gm_default_size = 1024, int64_t l3_default_size = 1024) { owned_ = true; backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); @@ -137,17 
+171,41 @@ struct XPUContext::Impl { << "Please NOTE: xpu device: " << static_cast(place_.device); context_ = xpu::create_context(); - context_->set_option("XPUAPI_DEFAULT_SIZE", - std::to_string(gm_default_size).c_str()); - VLOG(3) << "xpu place " << static_cast(place_.GetDeviceId()) - << "context " << context_ << " set xpuapi_default_size " - << gm_default_size; if (std::getenv("XPU_CDNN_CLUSTER_PARALLEL") != nullptr) { XPUStream s; xpu_stream_create(&s); context_->set_stream(s); } + + if (std::getenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD") == nullptr) { + // overload ctx alloc/free to avoid xpu_malloc/xpu_wait + auto overload_alloc_fn = + [&xhpc_buf_mgr = xhpc_buf_mgr_, + &place = place_, + s = context_->get_stream()](size_t size) -> void* { + return xhpc_buf_mgr.Alloc(place, size, s); + }; + auto overload_save_fn = [&xhpc_buf_mgr = xhpc_buf_mgr_]() { + xhpc_buf_mgr.Save(); + }; + auto overload_free_fn = [&xhpc_buf_mgr = xhpc_buf_mgr_]() { + xhpc_buf_mgr.Free(); + }; + context_->set_overload_alloc( + overload_alloc_fn, overload_free_fn, overload_save_fn); + gm_default_size = 1; + VLOG(1) << "XPUAPI_DEFAULT_SIZE is disabled because you overload the " + "alloc of xhpc. If you want to use XPUAPI_DEFAULT_SIZE, " + "please set XPU_PADDLE_DISABLE_ALLOC_OVERLOAD=1"; + } + + context_->set_option("XPUAPI_DEFAULT_SIZE", + std::to_string(gm_default_size).c_str()); + VLOG(3) << "xpu place " << static_cast(place_.GetDeviceId()) + << "context " << context_ << " set xpuapi_default_size " + << gm_default_size; + xpu_version_ = backends::xpu::get_xpu_version(place_.device); SetL3Cache(l3_default_size); } @@ -220,6 +278,7 @@ struct XPUContext::Impl { // NOTE: Distributed communicator, distributed framework manages its // resources, XPUContext only holds references. 
xpu::BKCLContext_t bkcl_context_{nullptr}; + XHPCBufferManager xhpc_buf_mgr_; }; static int64_t get_gm_size(int i) { diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index 42e36ece2615f1..47b3ab3cc9107b 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -18,7 +18,7 @@ namespace phi { namespace memory_utils { -Allocator::AllocationPtr Alloc(const phi::GPUPlace& place, +Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size, const phi::Stream& stream) { return MemoryUtils::Instance().Alloc(place, size, stream); diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index 43ba72fa736b21..a7a1884dc243c5 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -52,8 +52,7 @@ struct MemoryInterface { * @param[size_t] size memory size * @param[phi::Stream]stream the stream that is used for allocator */ - - Allocator::AllocationPtr (*alloc_with_stream)(const phi::GPUPlace& place, + Allocator::AllocationPtr (*alloc_with_stream)(const phi::Place& place, size_t size, const phi::Stream& stream); @@ -192,7 +191,7 @@ class MemoryUtils { memory_method_ = std::move(memory_method); } - Allocator::AllocationPtr Alloc(const phi::GPUPlace& place, + Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size, const phi::Stream& stream) { CheckMemoryMethod(); @@ -419,7 +418,7 @@ class MemoryUtils { namespace memory_utils { -TEST_API Allocator::AllocationPtr Alloc(const phi::GPUPlace& place, +TEST_API Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size, const phi::Stream& stream); diff --git a/test/xpu/cpp/CMakeLists.txt b/test/xpu/cpp/CMakeLists.txt index 8d1576446e9f34..abe42e8bb28b74 100644 --- a/test/xpu/cpp/CMakeLists.txt +++ b/test/xpu/cpp/CMakeLists.txt @@ -1 +1,2 @@ paddle_test(enforce_xpu_test SRCS enforce_xpu_test.cc) +paddle_test(overload_xpu_alloc_test SRCS overload_xpu_alloc_test.cc) diff --git 
a/test/xpu/cpp/overload_xpu_alloc_test.cc b/test/xpu/cpp/overload_xpu_alloc_test.cc new file mode 100644 index 00000000000000..77b5dd28cde91a --- /dev/null +++ b/test/xpu/cpp/overload_xpu_alloc_test.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/stats.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" + +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { +TEST(XPUOverloadAllocTest, EnvTest) { + setenv("XPUAPI_DEFAULT_SIZE", "4096", 1); + // use alloc overload + unsetenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD"); + phi::XPUContext dev_ctx_overload( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + EXPECT_STREQ(dev_ctx_overload.x_context()->get_option("XPUAPI_DEFAULT_SIZE"), + "1"); + EXPECT_NE(dev_ctx_overload.x_context()->overload_alloc_gm, nullptr); + // do not use alloc overload + setenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD", "1", 1); + phi::XPUContext dev_ctx_origin( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + EXPECT_STREQ(dev_ctx_origin.x_context()->get_option("XPUAPI_DEFAULT_SIZE"), + "4096"); + EXPECT_EQ(dev_ctx_origin.x_context()->overload_alloc_gm, nullptr); + unsetenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD"); + unsetenv("XPUAPI_DEFAULT_SIZE"); +} + +TEST(XPUOverloadAllocTest, 
BasicTest) { + phi::XPUContext dev_ctx( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + int numel = 64; + int alignment = phi::backends::xpu::XPUMinChunkSize(); + int expected_alloc_size = + allocation::AlignedSize(numel * sizeof(int), alignment); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + int* buffer = RAII_GUARD.alloc(numel); + int after_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + EXPECT_NE(buffer, nullptr); + EXPECT_EQ(after_alloc_value - pre_alloc_value, expected_alloc_size); +} + +TEST(XPUOverloadAllocTest, NestedScopeTest) { + phi::XPUContext dev_ctx( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + xpu::ctx_guard RAII_GUARD1(dev_ctx.x_context()); + int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + int* buffer_outter = RAII_GUARD1.alloc(64); + EXPECT_NE(buffer_outter, nullptr); + { + // The destruction of inner guard should not free the memory allocated from + // outer guard. + xpu::ctx_guard RAII_GUARD2(dev_ctx.x_context()); + int* buffer_inner = RAII_GUARD2.alloc(64); + EXPECT_NE(buffer_inner, nullptr); + } + int post_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + EXPECT_NE(post_alloc_value, pre_alloc_value); +} + +TEST(XPUOverloadAllocTest, MultiStreamTest) { + // Test whether stream 1 uses the memory pool of stream 0. 
+ int size = 64; + setenv("XPU_CDNN_CLUSTER_PARALLEL", "1", 1); + phi::XPUContext dev_ctx( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + xpu::ctx_guard RAII_GUARD0(dev_ctx.x_context(0)); + xpu::ctx_guard RAII_GUARD1(dev_ctx.x_context(1)); + int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + int* buffer0 = RAII_GUARD1.alloc(size); + EXPECT_NE(buffer0, nullptr); + { + int* buffer1 = RAII_GUARD0.alloc(size); + EXPECT_NE(buffer1, nullptr); + } + int post_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + + EXPECT_NE(pre_alloc_value, post_alloc_value); + unsetenv("XPU_CDNN_CLUSTER_PARALLEL"); +} +} // namespace memory +} // namespace paddle