diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 693c1cd47b0de0..d61380608c01b8 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -195,7 +195,7 @@ class Allocator : public phi::Allocator {
 
  protected:
   virtual phi::Allocation* AllocateImpl(size_t size) = 0;
-  virtual void FreeImpl(phi::Allocation* allocation);
+  TEST_API virtual void FreeImpl(phi::Allocation* allocation);
   virtual uint64_t ReleaseImpl(const platform::Place& place UNUSED) {
     return 0;
   }
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index f80fcac1b2a38a..984b7197c51997 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -64,7 +64,7 @@ class AllocatorFacade {
   std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                           size_t size);
   // Allocate a unique allocation.
-  AllocationPtr Alloc(const platform::Place& place, size_t size);
+  TEST_API AllocationPtr Alloc(const platform::Place& place, size_t size);
   // Release unused memory pool.
   uint64_t Release(const platform::Place& place);
 
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index 7e3a18955ac67e..47f8ed80bb2e53 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -105,13 +105,13 @@ class BestFitAllocation : public Allocation {
 // the prev-chunk and the next-chunk when possible.
 class BestFitAllocator : public Allocator {
  public:
-  explicit BestFitAllocator(phi::Allocation* allocation);
+  TEST_API explicit BestFitAllocator(phi::Allocation* allocation);
 
   void* BasePtr() const { return allocation_->ptr(); }
 
   const platform::Place& Place() const { return allocation_->place(); }
 
-  size_t NumFreeChunks() const;
+  TEST_API size_t NumFreeChunks() const;
 
   bool IsAllocThreadSafe() const override { return true; }
 
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
index 08a005dc793597..f15f3e1006848f 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
@@ -32,7 +32,7 @@ namespace allocation {
 // underlying_allocator_
 class BufferedAllocator : public Allocator {
  public:
-  explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
+  TEST_API explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
 
   ~BufferedAllocator();
 
@@ -42,7 +42,7 @@ class BufferedAllocator : public Allocator {
   inline void ClearCache() { FreeCache(-1UL); }
 
  private:
-  void FreeCache(size_t size);
+  TEST_API void FreeCache(size_t size);
 
  protected:
   void FreeImpl(phi::Allocation *allocation) override;
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 30e367f6e7f7ce..5f5420281f0ee7 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -34,11 +34,11 @@ namespace allocation {
 class CPUAllocator : public Allocator {
  public:
   constexpr static size_t kAlignment = 4096UL;
-  bool IsAllocThreadSafe() const override;
+  TEST_API bool IsAllocThreadSafe() const override;
 
  protected:
-  void FreeImpl(phi::Allocation* allocation) override;
-  phi::Allocation* AllocateImpl(size_t size) override;
+  TEST_API void FreeImpl(phi::Allocation* allocation) override;
+  TEST_API phi::Allocation* AllocateImpl(size_t size) override;
 };
 }  // namespace allocation
 }  // namespace memory
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
index 3d6500d0f56426..7a16287c75cb30 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
@@ -35,9 +35,9 @@ class NaiveBestFitAllocator : public Allocator {
   bool IsAllocThreadSafe() const override { return true; }
 
  protected:
-  phi::Allocation *AllocateImpl(size_t size) override;
-  void FreeImpl(phi::Allocation *allocation) override;
-  uint64_t ReleaseImpl(const platform::Place &place) override;
+  TEST_API phi::Allocation *AllocateImpl(size_t size) override;
+  TEST_API void FreeImpl(phi::Allocation *allocation) override;
+  TEST_API uint64_t ReleaseImpl(const platform::Place &place) override;
 
  private:
   platform::Place place_;
diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h
index b6d722b62a4b03..1416ec1e034724 100644
--- a/paddle/fluid/memory/stats.h
+++ b/paddle/fluid/memory/stats.h
@@ -121,17 +121,21 @@ class Stat : public StatBase {
 // performance than the macro function xxx_MEMORY_STAT_CURRENT_VALUE,
 // xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Try to use the macro
 // functions where ultra-low performance overhead is required.
-int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
-int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id);
-void DeviceMemoryStatUpdate(const std::string& stat_type,
-                            int dev_id,
-                            int64_t increment);
-
-int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
-int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id);
-void HostMemoryStatUpdate(const std::string& stat_type,
-                          int dev_id,
-                          int64_t increment);
+TEST_API int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type,
+                                              int dev_id);
+TEST_API int64_t DeviceMemoryStatPeakValue(const std::string& stat_type,
+                                           int dev_id);
+TEST_API void DeviceMemoryStatUpdate(const std::string& stat_type,
+                                     int dev_id,
+                                     int64_t increment);
+
+TEST_API int64_t HostMemoryStatCurrentValue(const std::string& stat_type,
+                                            int dev_id);
+TEST_API int64_t HostMemoryStatPeakValue(const std::string& stat_type,
+                                         int dev_id);
+TEST_API void HostMemoryStatUpdate(const std::string& stat_type,
+                                   int dev_id,
+                                   int64_t increment);
 
 void LogDeviceMemoryStats(const platform::Place& place,
                           const std::string& op_name);
diff --git a/test/cpp/fluid/memory/CMakeLists.txt b/test/cpp/fluid/memory/CMakeLists.txt
index 5bb36f73982287..c4bf57aa7bae7c 100644
--- a/test/cpp/fluid/memory/CMakeLists.txt
+++ b/test/cpp/fluid/memory/CMakeLists.txt
@@ -1,17 +1,9 @@
-cc_test(
-  memory_stats_test
-  SRCS memory_stats_test.cc
-  DEPS)
-cc_test(
-  stats_test
-  SRCS stats_test.cc
-  DEPS)
-
-cc_test(
-  naive_best_fit_allocator_test
-  SRCS naive_best_fit_allocator_test.cc
-  DEPS allocator)
-cc_test(
+paddle_test(memory_stats_test SRCS memory_stats_test.cc)
+paddle_test(stats_test SRCS stats_test.cc)
+
+paddle_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc
+            DEPS allocator)
+nv_test(
   buffered_allocator_test
   SRCS buffered_allocator_test.cc
   DEPS allocator)
@@ -40,18 +32,15 @@ elseif(WITH_ROCM)
     SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu
     DEPS allocator)
 else()
-  cc_test(
-    best_fit_allocator_test
-    SRCS best_fit_allocator_test.cc
-    DEPS allocator)
+  paddle_test(best_fit_allocator_test SRCS best_fit_allocator_test.cc)
 endif()
 
-cc_test(
+nv_test(
   test_aligned_allocator
   SRCS test_aligned_allocator.cc
   DEPS allocator)
 
-cc_test(
+nv_test(
   retry_allocator_test
   SRCS retry_allocator_test.cc
   DEPS allocator)
@@ -60,12 +49,10 @@ if(TEST retry_allocator_test)
                                                        "RUN_TYPE=EXCLUSIVE")
 endif()
 
-cc_test(
-  allocator_facade_abs_flags_test
-  SRCS allocator_facade_abs_flags_test.cc
-  DEPS allocator)
+paddle_test(allocator_facade_abs_flags_test SRCS
+            allocator_facade_abs_flags_test.cc)
 
-cc_test(
+nv_test(
   allocator_facade_frac_flags_test
   SRCS allocator_facade_frac_flags_test.cc
   DEPS allocator)
@@ -131,28 +118,36 @@ if(WITH_GPU AND WITH_TESTING)
                                     FLAGS_use_stream_safe_cuda_allocator=true;")
 endif()
 
-cc_test(
+paddle_test(
   auto_growth_best_fit_allocator_facade_test
   SRCS auto_growth_best_fit_allocator_facade_test.cc
   DEPS allocator)
-cc_test(
+nv_test(
   auto_growth_best_fit_allocator_test
   SRCS auto_growth_best_fit_allocator_test.cc
   DEPS allocator)
 
 if(NOT WIN32)
-  cc_test(
+  paddle_test(
     mmap_allocator_test
     SRCS mmap_allocator_test.cc
     DEPS allocator)
 endif()
 
-cc_test(
+paddle_test(
   system_allocator_test
   SRCS system_allocator_test.cc
   DEPS allocator)
 
-cc_test(
+paddle_test(
   buddy_allocator_test
   SRCS buddy_allocator_test.cc
   DEPS allocator)
+
+if(WITH_ONNXRUNTIME AND WIN32)
+  # Copy the onnxruntime library for these C++ tests on Windows. The tests
+  # are built only in CI, so we assume the Windows generator is Ninja.
+  copy_onnx(memory_stats_test)
+  copy_onnx(stats_test)
+  copy_onnx(naive_best_fit_allocator_test)
+endif()