[XPU] merge memory pool for Paddle and XHPC by using alloc overloading #63924

Merged
merged 11 commits on May 24, 2024
Changes from all commits
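The central change in the diff below: XHPC's workspace allocations are rerouted into Paddle's stream-aware allocator through overloaded alloc/free/save hooks, and a small manager keeps one stack entry of live allocations per ctx_guard scope, so nested guards only release their own buffers. A minimal standalone sketch of that scope-stack pattern, with std::malloc standing in for the device allocator (all names below are illustrative, not Paddle APIs):

// Illustrative sketch only: a scope-stack buffer manager in the spirit of the
// XHPCBufferManager introduced in this PR.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

using Buffer = std::unique_ptr<void, decltype(&std::free)>;

class ScopeStackBufferManager {
 public:
  // "save" hook: a newly constructed ctx_guard opens a new scope.
  void Save() { scopes_.emplace_back(); }

  // "alloc" hook: the buffer is owned by the innermost open scope.
  void* Alloc(std::size_t size) {
    scopes_.back().emplace_back(std::malloc(size), &std::free);
    return scopes_.back().back().get();
  }

  // "free" hook: a destroyed ctx_guard releases only its own scope,
  // leaving buffers of enclosing scopes untouched.
  void Free() { scopes_.pop_back(); }

 private:
  std::vector<std::vector<Buffer>> scopes_;
};

int main() {
  ScopeStackBufferManager mgr;
  mgr.Save();                    // outer ctx_guard
  void* outer = mgr.Alloc(256);
  mgr.Save();                    // nested ctx_guard
  mgr.Alloc(64);
  mgr.Free();                    // inner guard gone; the outer buffer survives
  std::cout << "outer buffer still live at " << outer << std::endl;
  mgr.Free();                    // outer guard gone; everything is freed
  return 0;
}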
18 changes: 16 additions & 2 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1673,6 +1673,22 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
}
}
#endif
#if defined(PADDLE_WITH_XPU)
if (platform::is_xpu_place(place)) {
if (!GetPrivate()->IsStreamSafeCUDAAllocatorUsed()) {
return Alloc(place, size);
}
platform::XPUPlace p(place);
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
XPUStream s = reinterpret_cast<XPUStream>(stream.id());
return GetPrivate()
->GetAllocator(p, s, /* create_if_not_found = */ true)
->Allocate(size);
} else {
return GetPrivate()->GetAllocator(p, size)->Allocate(size);
}
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
AllocatorFacadePrivate* m = GetPrivate();
if (!m->IsStreamSafeCUDAAllocatorUsed() &&
@@ -1690,8 +1706,6 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
} else {
return m->GetAllocator(p, size)->Allocate(size);
}
#elif defined(PADDLE_WITH_XPU)
return GetAllocator(place)->Allocate(size);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not compiled with GPU or XPU or CustomDevice."));
2 changes: 1 addition & 1 deletion paddle/fluid/memory/malloc.cc
@@ -41,7 +41,7 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
place, size, stream);
}

AllocationPtr Alloc(const platform::CUDAPlace& place,
AllocationPtr Alloc(const platform::Place& place,
size_t size,
const phi::Stream& stream) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
2 changes: 1 addition & 1 deletion paddle/fluid/memory/malloc.h
@@ -40,7 +40,7 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
size_t size,
const phi::Stream& stream);

extern AllocationPtr Alloc(const platform::CUDAPlace& place,
extern AllocationPtr Alloc(const platform::Place& place,
size_t size,
const phi::Stream& stream);

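With Alloc generalized from platform::CUDAPlace to platform::Place, an XPU caller can now hand the facade an explicit stream. A hedged sketch of such a call site (not code from this PR; the StreamId round-trip mirrors the xpu_context.cc hunk further down, and the device id 0 is arbitrary):

// Hypothetical call site enabled by this signature change. Assumes the XPU
// runtime header providing XPUStream is already included.
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/core/stream.h"

paddle::memory::AllocationPtr AllocOnXpuStream(XPUStream xpu_stream,
                                               size_t nbytes) {
  // Wrap the raw XPU stream handle the same way xpu_context.cc does.
  phi::Stream stream(reinterpret_cast<phi::StreamId>(xpu_stream));
  return paddle::memory::Alloc(phi::XPUPlace(0), nbytes, stream);
}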
69 changes: 64 additions & 5 deletions paddle/phi/backends/xpu/xpu_context.cc
@@ -20,6 +20,7 @@

#include "paddle/common/exception.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/os_info.h"
#include "xpu/runtime.h"
@@ -130,24 +131,81 @@ struct XPUContext::Impl {
}
}

class XHPCBufferManager {
public:
void* Alloc(const Place& place, size_t size, XPUStream xpu_stream) {
VLOG(3) << "Alloc " << size << " bytes from XHPC on stream "
<< xpu_stream;
phi::Stream stream(reinterpret_cast<StreamId>(xpu_stream));
auto allocation = memory_utils::Alloc(place, size, stream);
void* ret = allocation.get()->ptr();
allocations_to_free_.back().push_back(std::move(allocation));
return ret;
}

void Save() {
allocations_to_free_.emplace_back();
VLOG(3) << "XHPC ctx_guard created, " << GetStackLevel()
<< " are in use now.";
}

void Free() {
PADDLE_ENFORCE_GT(GetStackLevel(),
0,
errors::PreconditionNotMet(
"No ctx_guard when overload_free is called"));
allocations_to_free_.pop_back();
VLOG(3) << "XHPC ctx_guard destropyed, " << GetStackLevel()
<< " are in use now.";
}

private:
size_t GetStackLevel() const { return allocations_to_free_.size(); }
std::vector<std::vector<Allocator::AllocationPtr>> allocations_to_free_;
};

void Init(int64_t gm_default_size = 1024, int64_t l3_default_size = 1024) {
owned_ = true;
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
LOG_FIRST_N(WARNING, 1)
<< "Please NOTE: xpu device: " << static_cast<int>(place_.device);

context_ = xpu::create_context();
context_->set_option("XPUAPI_DEFAULT_SIZE",
std::to_string(gm_default_size).c_str());
VLOG(3) << "xpu place " << static_cast<int>(place_.GetDeviceId())
<< "context " << context_ << " set xpuapi_default_size "
<< gm_default_size;

if (std::getenv("XPU_CDNN_CLUSTER_PARALLEL") != nullptr) {
XPUStream s;
xpu_stream_create(&s);
context_->set_stream(s);
}

if (std::getenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD") == nullptr) {
// overload ctx alloc/free to avoid xpu_malloc/xpu_wait
auto overload_alloc_fn =
[&xhpc_buf_mgr = xhpc_buf_mgr_,
&place = place_,
s = context_->get_stream()](size_t size) -> void* {
return xhpc_buf_mgr.Alloc(place, size, s);
};
auto overload_save_fn = [&xhpc_buf_mgr = xhpc_buf_mgr_]() {
xhpc_buf_mgr.Save();
};
auto overload_free_fn = [&xhpc_buf_mgr = xhpc_buf_mgr_]() {
xhpc_buf_mgr.Free();
};
context_->set_overload_alloc(
overload_alloc_fn, overload_free_fn, overload_save_fn);
gm_default_size = 1;

Contributor:
Will the BufferMgr align-to-64 warning still show up here?

Contributor Author (@lj970926, May 24, 2024):
For GM, the API automatically does 64-byte alignment (L3 does not). This is set to 1 because the API used to have a bug: with 0, the buffer would not be freed. (The bug has been fixed, but the fix has not been merged into Paddle yet; we will switch this to 0 later.)

VLOG(1) << "XPUAPI_DEFUAULT_SIZE is disabled because you overload the "
"alloc of xhpc. If you want to use XPUAPI_DEFAULT_SIZE, "
"please set XPU_PADDLE_DISABLE_ALLOC_OVERLOAD=1";
}

context_->set_option("XPUAPI_DEFAULT_SIZE",
std::to_string(gm_default_size).c_str());
VLOG(3) << "xpu place " << static_cast<int>(place_.GetDeviceId())
<< "context " << context_ << " set xpuapi_default_size "
<< gm_default_size;

xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache(l3_default_size);
}
@@ -220,6 +278,7 @@ struct XPUContext::Impl {
// NOTE: Distributed communicator, distributed framework manages its
// resources, XPUContext only holds references.
xpu::BKCLContext_t bkcl_context_{nullptr};
XHPCBufferManager xhpc_buf_mgr_;
};

static int64_t get_gm_size(int i) {
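The overload registered in Init above is opt-out: when XPU_PADDLE_DISABLE_ALLOC_OVERLOAD is set, XHPC keeps its own XPUAPI_DEFAULT_SIZE workspace instead of borrowing from Paddle's pool. A small sketch of turning the overload off, following the pattern of the new unit test at the bottom of this diff (the helper name is illustrative):

// Sketch: disable the alloc overload before the context is created, as the
// new overload_xpu_alloc_test.cc does with setenv.
#include <cstdlib>
#include "paddle/phi/backends/xpu/xpu_context.h"

void RunWithoutAllocOverload(int device_id) {
  // Must be set before the XPUContext (and its xpu::Context) is constructed.
  setenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD", "1", /*overwrite=*/1);
  phi::XPUContext dev_ctx(phi::XPUPlace(device_id));
  // dev_ctx now leaves XHPC on its own workspace, sized by XPUAPI_DEFAULT_SIZE.
  unsetenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD");
}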
2 changes: 1 addition & 1 deletion paddle/phi/common/memory_utils.cc
@@ -18,7 +18,7 @@ namespace phi {

namespace memory_utils {

Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
Allocator::AllocationPtr Alloc(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
return MemoryUtils::Instance().Alloc(place, size, stream);
7 changes: 3 additions & 4 deletions paddle/phi/common/memory_utils.h
@@ -52,8 +52,7 @@ struct MemoryInterface {
* @param[size_t] size memory size
* @param[phi::Stream]stream the stream that is used for allocator
*/

Allocator::AllocationPtr (*alloc_with_stream)(const phi::GPUPlace& place,
Allocator::AllocationPtr (*alloc_with_stream)(const phi::Place& place,
size_t size,
const phi::Stream& stream);

@@ -192,7 +191,7 @@ class MemoryUtils {
memory_method_ = std::move(memory_method);
}

Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
Allocator::AllocationPtr Alloc(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
CheckMemoryMethod();
@@ -419,7 +418,7 @@ class MemoryUtils {

namespace memory_utils {

TEST_API Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
TEST_API Allocator::AllocationPtr Alloc(const phi::Place& place,
size_t size,
const phi::Stream& stream);

1 change: 1 addition & 0 deletions test/xpu/cpp/CMakeLists.txt
@@ -1 +1,2 @@
paddle_test(enforce_xpu_test SRCS enforce_xpu_test.cc)
paddle_test(overload_xpu_alloc_test SRCS overload_xpu_alloc_test.cc)
104 changes: 104 additions & 0 deletions test/xpu/cpp/overload_xpu_alloc_test.cc
@@ -0,0 +1,104 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"

#include "gtest/gtest.h"

namespace paddle {
namespace memory {
TEST(XPUOverloadAllocTest, EnvTest) {
setenv("XPUAPI_DEFAULT_SIZE", "4096", 1);
// use alloc overload
unsetenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD");
phi::XPUContext dev_ctx_overload(
phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId()));
EXPECT_STREQ(dev_ctx_overload.x_context()->get_option("XPUAPI_DEFAULT_SIZE"),
"1");
EXPECT_NE(dev_ctx_overload.x_context()->overload_alloc_gm, nullptr);
// do not use alloc overload
setenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD", "1", 1);
phi::XPUContext dev_ctx_origin(
phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId()));
EXPECT_STREQ(dev_ctx_origin.x_context()->get_option("XPUAPI_DEFAULT_SIZE"),
"4096");
EXPECT_EQ(dev_ctx_origin.x_context()->overload_alloc_gm, nullptr);
unsetenv("XPU_PADDLE_DISABLE_ALLOC_OVERLOAD");
unsetenv("XPUAPI_DEFAULT_SIZE");
}

TEST(XPUOverloadAllocTest, BasicTest) {
phi::XPUContext dev_ctx(
phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId()));
int numel = 64;
int alignment = phi::backends::xpu::XPUMinChunkSize();
int expected_alloc_size =
allocation::AlignedSize(numel * sizeof(int), alignment);
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE(
Allocated, dev_ctx.GetPlace().GetDeviceId());
int* buffer = RAII_GUARD.alloc<int>(numel);
int after_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE(
Allocated, dev_ctx.GetPlace().GetDeviceId());
EXPECT_NE(buffer, nullptr);
EXPECT_EQ(after_alloc_value - pre_alloc_value, expected_alloc_size);
}

TEST(XPUOverloadAllocTest, NestedScopeTest) {
phi::XPUContext dev_ctx(
phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId()));
xpu::ctx_guard RAII_GUARD1(dev_ctx.x_context());
int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE(
Allocated, dev_ctx.GetPlace().GetDeviceId());
int* buffer_outter = RAII_GUARD1.alloc<int>(64);
EXPECT_NE(buffer_outter, nullptr);
{
// The destruction of the inner guard should not free the memory allocated
// from the outer guard.
xpu::ctx_guard RAII_GUARD2(dev_ctx.x_context());
int* buffer_inner = RAII_GUARD2.alloc<int>(64);
EXPECT_NE(buffer_inner, nullptr);
}
int post_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE(
Allocated, dev_ctx.GetPlace().GetDeviceId());
EXPECT_NE(post_alloc_value, pre_alloc_value);
}

TEST(XPUOverloadAllocTest, MultiStreamTest) {
// Test whether stream 1 uses the memory pool of stream 0.
int size = 64;
setenv("XPU_CDNN_CLUSTER_PARALLEL", "1", 1);
phi::XPUContext dev_ctx(
phi::XPUPlace(phi::backends::xpu::GetXPUCurrentDeviceId()));
xpu::ctx_guard RAII_GUARD0(dev_ctx.x_context(0));
xpu::ctx_guard RAII_GUARD1(dev_ctx.x_context(1));
int pre_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE(
Allocated, dev_ctx.GetPlace().GetDeviceId());
int* buffer0 = RAII_GUARD1.alloc<int>(size);
EXPECT_NE(buffer0, nullptr);
{
int* buffer1 = RAII_GUARD0.alloc<int>(size);
EXPECT_NE(buffer1, nullptr);
}
int post_alloc_value = DEVICE_MEMORY_STAT_CURRENT_VALUE(
Allocated, dev_ctx.GetPlace().GetDeviceId());

EXPECT_NE(pre_alloc_value, post_alloc_value);
unsetenv("XPU_CDNN_CLUSTER_PARALLEL");
}
} // namespace memory
} // namespace paddle
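A note on the size check in BasicTest above: the device memory stat grows by the aligned request, not by the raw byte count. A worked example of that round-up arithmetic, with a 256-byte alignment assumed as a stand-in for phi::backends::xpu::XPUMinChunkSize():

// Round-up arithmetic behind expected_alloc_size in BasicTest; the 256-byte
// minimum chunk here is an assumption, the real value comes from the backend.
#include <cstddef>
#include <iostream>

std::size_t AlignUp(std::size_t size, std::size_t alignment) {
  return ((size + alignment - 1) / alignment) * alignment;
}

int main() {
  constexpr std::size_t kAssumedMinChunk = 256;
  const std::size_t request = 64 * sizeof(int);             // 256 bytes
  std::cout << AlignUp(request, kAssumedMinChunk) << "\n";  // prints 256
  std::cout << AlignUp(100, kAssumedMinChunk) << "\n";      // rounds up to 256
  return 0;
}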