diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index d1f627070f4e9a..6cd9a1347f6322 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -1673,6 +1673,22 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } } #endif +#if defined(PADDLE_WITH_XPU) + if (platform::is_xpu_place(place)) { + if (!GetPrivate()->IsStreamSafeCUDAAllocatorUsed()) { + return Alloc(place, size); + } + platform::XPUPlace p(place); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + XPUStream s = reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); + } + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed() && @@ -1690,8 +1706,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } else { return m->GetAllocator(p, size)->Allocate(size); } -#elif defined(PADDLE_WITH_XPU) - return GetAllocator(place)->Allocate(size); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with GPU or XPU or CustomDevice.")); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c40da19d47e5f..7dae4436854ef4 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -41,7 +41,7 @@ std::shared_ptr AllocShared(const platform::Place& place, place, size, stream); } -AllocationPtr Alloc(const platform::CUDAPlace& place, +AllocationPtr Alloc(const platform::Place& place, size_t size, const phi::Stream& stream) { return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index dc25b85c8b0402..4cd642aedcdf7a 100644 --- 
a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -40,7 +40,7 @@ extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, const phi::Stream& stream); -extern AllocationPtr Alloc(const platform::CUDAPlace& place, +extern AllocationPtr Alloc(const platform::Place& place, size_t size, const phi::Stream& stream); diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index f1240bcb348f00..72f448a4a9aff0 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -20,6 +20,7 @@ #include "paddle/common/exception.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/os_info.h" #include "xpu/runtime.h" @@ -130,6 +131,39 @@ struct XPUContext::Impl { } } + class XHPCBufferManager { + public: + void* Alloc(const Place& place, size_t size, XPUStream xpu_stream) { + VLOG(3) << "Alloc " << size << " bytes from XHPC on stream " + << xpu_stream; + phi::Stream stream(reinterpret_cast(xpu_stream)); + auto allocation = memory_utils::Alloc(place, size, stream); + void* ret = allocation.get()->ptr(); + allocations_to_free_.back().push_back(std::move(allocation)); + return ret; + } + + void Save() { + allocations_to_free_.emplace_back(); + VLOG(3) << "XHPC ctx_guard created, " << GetStackLevel() + << " are in use now."; + } + + void Free() { + PADDLE_ENFORCE_GT(GetStackLevel(), + 0, + errors::PreconditionNotMet( + "No ctx_guard when overload_free is called")); + allocations_to_free_.pop_back(); + VLOG(3) << "XHPC ctx_guard destroyed, " << GetStackLevel() + << " are in use now."; + } + + private: + size_t GetStackLevel() const { return allocations_to_free_.size(); } + std::vector> allocations_to_free_; + }; + void Init(int64_t gm_default_size = 1024, int64_t l3_default_size = 1024) { owned_ = true; backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); @@ -137,17 
+171,41 @@ struct XPUContext::Impl { << "Please NOTE: xpu device: " << static_cast(place_.device); context_ = xpu::create_context(); - context_->set_option("XPUAPI_DEFAULT_SIZE", - std::to_string(gm_default_size).c_str()); - VLOG(3) << "xpu place " << static_cast(place_.GetDeviceId()) - << "context " << context_ << " set xpuapi_default_size " - << gm_default_size; if (std::getenv("XPU_CDNN_CLUSTER_PARALLEL") != nullptr) { XPUStream s; xpu_stream_create(&s); context_->set_stream(s); } + + if (std::getenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD") == nullptr) { + // overload ctx alloc/free to avoid xpu_malloc/xpu_wait + auto overload_alloc_fn = + [&xhpc_buf_mgr = xhpc_buf_mgr_, + &place = place_, + s = context_->get_stream()](size_t size) -> void* { + return xhpc_buf_mgr.Alloc(place, size, s); + }; + auto overload_save_fn = [&xhpc_buf_mgr = xhpc_buf_mgr_]() { + xhpc_buf_mgr.Save(); + }; + auto overload_free_fn = [&xhpc_buf_mgr = xhpc_buf_mgr_]() { + xhpc_buf_mgr.Free(); + }; + context_->set_overload_alloc( + overload_alloc_fn, overload_free_fn, overload_save_fn); + gm_default_size = 1; + VLOG(1) << "XPUAPI_DEFAULT_SIZE is disabled because you overload the " + "alloc of xhpc. If you want to use XPUAPI_DEFAULT_SIZE, " + "please set XPU_PADDLE_DISABLE_ALLOC_OVERLOAD=1"; + } + + context_->set_option("XPUAPI_DEFAULT_SIZE", + std::to_string(gm_default_size).c_str()); + VLOG(3) << "xpu place " << static_cast(place_.GetDeviceId()) + << "context " << context_ << " set xpuapi_default_size " + << gm_default_size; + xpu_version_ = backends::xpu::get_xpu_version(place_.device); SetL3Cache(l3_default_size); } @@ -220,6 +278,7 @@ struct XPUContext::Impl { // NOTE: Distributed communicator, distributed framework manages its // resources, XPUContext only holds references. 
xpu::BKCLContext_t bkcl_context_{nullptr}; + XHPCBufferManager xhpc_buf_mgr_; }; static int64_t get_gm_size(int i) { diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index 42e36ece2615f1..47b3ab3cc9107b 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -18,7 +18,7 @@ namespace phi { namespace memory_utils { -Allocator::AllocationPtr Alloc(const phi::GPUPlace& place, +Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size, const phi::Stream& stream) { return MemoryUtils::Instance().Alloc(place, size, stream); diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index 43ba72fa736b21..a7a1884dc243c5 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -52,8 +52,7 @@ struct MemoryInterface { * @param[size_t] size memory size * @param[phi::Stream]stream the stream that is used for allocator */ - - Allocator::AllocationPtr (*alloc_with_stream)(const phi::GPUPlace& place, + Allocator::AllocationPtr (*alloc_with_stream)(const phi::Place& place, size_t size, const phi::Stream& stream); @@ -192,7 +191,7 @@ class MemoryUtils { memory_method_ = std::move(memory_method); } - Allocator::AllocationPtr Alloc(const phi::GPUPlace& place, + Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size, const phi::Stream& stream) { CheckMemoryMethod(); @@ -419,7 +418,7 @@ class MemoryUtils { namespace memory_utils { -TEST_API Allocator::AllocationPtr Alloc(const phi::GPUPlace& place, +TEST_API Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size, const phi::Stream& stream); diff --git a/test/xpu/cpp/CMakeLists.txt b/test/xpu/cpp/CMakeLists.txt index 8d1576446e9f34..abe42e8bb28b74 100644 --- a/test/xpu/cpp/CMakeLists.txt +++ b/test/xpu/cpp/CMakeLists.txt @@ -1 +1,2 @@ paddle_test(enforce_xpu_test SRCS enforce_xpu_test.cc) +paddle_test(overload_xpu_alloc_test SRCS overload_xpu_alloc_test.cc) diff --git 
a/test/xpu/cpp/overload_xpu_alloc_test.cc b/test/xpu/cpp/overload_xpu_alloc_test.cc new file mode 100644 index 00000000000000..77b5dd28cde91a --- /dev/null +++ b/test/xpu/cpp/overload_xpu_alloc_test.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/stats.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" + +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { +TEST(XPUOverloadAllocTest, EnvTest) { + setenv("XPUAPI_DEFAULT_SIZE", "4096", 1); + // use alloc overload + unsetenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD"); + phi::XPUContext dev_ctx_overload( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + EXPECT_STREQ(dev_ctx_overload.x_context()->get_option("XPUAPI_DEFAULT_SIZE"), + "1"); + EXPECT_NE(dev_ctx_overload.x_context()->overload_alloc_gm, nullptr); + // do not use alloc overload + setenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD", "1", 1); + phi::XPUContext dev_ctx_origin( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + EXPECT_STREQ(dev_ctx_origin.x_context()->get_option("XPUAPI_DEFAULT_SIZE"), + "4096"); + EXPECT_EQ(dev_ctx_origin.x_context()->overload_alloc_gm, nullptr); + unsetenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD"); + unsetenv("XPUAPI_DEFAULT_SIZE"); +} + +TEST(XPUOverloadAllocTest, 
BasicTest) { + phi::XPUContext dev_ctx( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + int numel = 64; + int alignment = phi::backends::xpu::XPUMinChunkSize(); + int expected_alloc_size = + allocation::AlignedSize(numel * sizeof(int), alignment); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + int* buffer = RAII_GUARD.alloc(numel); + int after_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + EXPECT_NE(buffer, nullptr); + EXPECT_EQ(after_alloc_value - pre_alloc_value, expected_alloc_size); +} + +TEST(XPUOverloadAllocTest, NestedScopeTest) { + phi::XPUContext dev_ctx( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + xpu::ctx_guard RAII_GUARD1(dev_ctx.x_context()); + int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + int* buffer_outter = RAII_GUARD1.alloc(64); + EXPECT_NE(buffer_outter, nullptr); + { + // The destruction of inner guard should not free the memory allocated from + // outer guard. + xpu::ctx_guard RAII_GUARD2(dev_ctx.x_context()); + int* buffer_inner = RAII_GUARD2.alloc(64); + EXPECT_NE(buffer_inner, nullptr); + } + int post_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + EXPECT_NE(post_alloc_value, pre_alloc_value); +} + +TEST(XPUOverloadAllocTest, MultiStreamTest) { + // Test whether stream 1 uses the memory pool of stream 0. 
+ int size = 64; + setenv("XPU_CDNN_CLUSTER_PARALLEL", "1", 1); + phi::XPUContext dev_ctx( + phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId())); + xpu::ctx_guard RAII_GUARD0(dev_ctx.x_context(0)); + xpu::ctx_guard RAII_GUARD1(dev_ctx.x_context(1)); + int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + int* buffer0 = RAII_GUARD1.alloc(size); + EXPECT_NE(buffer0, nullptr); + { + int* buffer1 = RAII_GUARD0.alloc(size); + EXPECT_NE(buffer1, nullptr); + } + int post_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, dev_ctx.GetPlace().GetDeviceId()); + + EXPECT_NE(pre_alloc_value, post_alloc_value); + unsetenv("XPU_CDNN_CLUSTER_PARALLEL"); +} +} // namespace memory +} // namespace paddle