From f818cc38f0b556549405fdb60b0fae5014ab1f36 Mon Sep 17 00:00:00 2001 From: Ben Liu <38140930+liu15@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:25:50 -0800 Subject: [PATCH 1/7] Deep copy (#303) * Add deep copy option to avoid the need for xnack+ --------- Co-authored-by: Alan Dayton <6393677+adayton1@users.noreply.github.com> --- RELEASE_NOTES.md | 3 +++ cmake/SetupOptions.cmake | 1 + src/care/config.h.in | 1 + src/care/host_device_ptr.h | 23 ++++++++++++++++++++++- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 1cf191b3..3d7efc33 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -14,6 +14,9 @@ The format of this file is based on [Keep a Changelog](http://keepachangelog.com ## [Unreleased] - Release date YYYY-MM-DD +### Added +- Added CARE\_DEEP\_COPY\_RAW\_PTR configuration option. + ### Added - Added ATOMIC\_SUB, ATOMIC\_LOAD, ATOMIC\_STORE, ATOMIC\_EXCHANGE, and ATOMIC\_CAS macros. - Added TSAN\_ONLY\_ATOMIC\_\* macros to suppress tsan data race reports. Controlled by CARE\_ENABLE\_TSAN\_ONLY\_ATOMICS configuration option. diff --git a/cmake/SetupOptions.cmake b/cmake/SetupOptions.cmake index ef9b48cc..db7fe22f 100644 --- a/cmake/SetupOptions.cmake +++ b/cmake/SetupOptions.cmake @@ -19,6 +19,7 @@ option(CARE_ENABLE_IMPLICIT_CONVERSIONS "Enable implicit conversions to-from raw # CHAI must also be configured with the same settings for implicit conversions. set(CHAI_ENABLE_IMPLICIT_CONVERSIONS ${CARE_ENABLE_IMPLICIT_CONVERSIONS} CACHE BOOL "Enable implicit conversions to-from raw pointers") option(CARE_LEGACY_COMPATIBILITY_MODE "Enable legacy compatibility mode" OFF) +option(CARE_DEEP_COPY_RAW_PTR "Use deep copy for managed array initialization from raw pointer" OFF) option(CARE_ENABLE_MANAGED_PTR "Enable managed_ptr aliases, tests, and reproducer" ON) option(CARE_DISABLE_RAJAPLUGIN "Disable use of the RAJA plugin. WILL ALSO DISABLE MEMORY MOTION." 
OFF) option(CARE_ENABLE_EXTERN_INSTANTIATE "Enable extern instantiation of template functions" OFF) diff --git a/src/care/config.h.in b/src/care/config.h.in index ee7c83e0..b03790ae 100644 --- a/src/care/config.h.in +++ b/src/care/config.h.in @@ -19,6 +19,7 @@ #ifndef CARE_LEGACY_COMPATIBILITY_MODE #cmakedefine01 CARE_LEGACY_COMPATIBILITY_MODE #endif +#cmakedefine CARE_DEEP_COPY_RAW_PTR #cmakedefine CARE_ENABLE_MANAGED_PTR #cmakedefine CARE_DISABLE_RAJAPLUGIN #cmakedefine CARE_ENABLE_EXTERN_INSTANTIATE diff --git a/src/care/host_device_ptr.h b/src/care/host_device_ptr.h index fb1c23ff..3dfe40f4 100644 --- a/src/care/host_device_ptr.h +++ b/src/care/host_device_ptr.h @@ -23,6 +23,7 @@ // Std library headers #include + namespace care { /////////////////////////////////////////////////////////////////////////// /// @struct _kv @@ -138,13 +139,20 @@ namespace care { { } -#if defined (CHAI_DISABLE_RM) || defined(CHAI_THIN_GPU_ALLOCATE) /// /// @author Peter Robinson /// /// Construct from a raw pointer, size, and name /// This is defined when the CHAI resource manager is disabled /// +#if defined(CARE_DEEP_COPY_RAW_PTR) + host_device_ptr(T* from, size_t size, const char * name) + : MA(size) + { + std::copy_n(from, size, (T_non_const*)MA::data()); + } +#else /* defined(CARE_DEEP_COPY_RAW_PTR) */ +#if defined (CHAI_DISABLE_RM) || defined(CHAI_THIN_GPU_ALLOCATE) host_device_ptr(T* from, size_t size, const char * name) : MA(from, nullptr, size, nullptr) { @@ -173,6 +181,7 @@ namespace care { } } #endif +#endif /* defined(CARE_DEEP_COPY_RAW_PTR) */ /// /// @author Peter Robinson @@ -412,6 +421,16 @@ namespace care { void freeDeviceMemory(T_non_const ** CPU_destination, size_t elems, bool deregisterPointer=true) { +#if defined(CARE_DEEP_COPY_RAW_PTR) + // if there is a pointer to update ... 
+ if (CPU_destination != nullptr) { + if (*CPU_destination == nullptr) { + *CPU_destination = (T_non_const *) std::malloc(elems*sizeof(T)); + } + std::copy_n(MA::cdata(), elems, *CPU_destination); + } + MA::free(); +#else /* defined(CARE_DEEP_COPY_RAW_PTR) */ #if !defined(CHAI_DISABLE_RM) #if defined(CHAI_GPUCC) || CARE_ENABLE_GPU_SIMULATION_MODE if (CPU_destination != nullptr) { @@ -441,6 +460,7 @@ namespace care { arrayManager->deregisterPointer(MA::m_pointer_record,true); CHAICallback::deregisterRecord(MA::m_pointer_record); } + #else // no resource manager active #if defined(CHAI_THIN_GPU_ALLOCATE) // GPU allocated thin wrapped // ... then sync to ensure data is up to date @@ -464,6 +484,7 @@ namespace care { } } #endif +#endif /* defined(CARE_DEEP_COPY_RAW_PTR) */ } CARE_HOST_DEVICE void pick(int idx, T_non_const& val) const { From 25fd008779d625704534a233dc0268f78d03eead Mon Sep 17 00:00:00 2001 From: Alan Dayton <6393677+adayton1@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:27:00 -0800 Subject: [PATCH 2/7] Remove implicit conversions (#305) --- RELEASE_NOTES.md | 1 + cmake/SetupOptions.cmake | 4 -- .../lc/toss_4_x86_64_ib_cray/amdclang.cmake | 2 - src/care/KeyValueSorter_decl.h | 51 ------------------- src/care/algorithm_decl.h | 10 ---- src/care/algorithm_impl.h | 15 ------ src/care/care_inst.h | 15 ------ src/care/config.h.in | 1 - src/care/host_device_ptr.h | 16 ------ src/care/host_ptr.h | 9 ---- 10 files changed, 1 insertion(+), 123 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 3d7efc33..d3e45687 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -25,6 +25,7 @@ The format of this file is based on [Keep a Changelog](http://keepachangelog.com - Removed Accessor template parameter from host\_device\_ptr. - Removed NoOpAccessor and RaceConditionAccessor. It is recommended to use ThreadSanitizer (TSAN) instead to locate race conditions. - Removed CARE\_ENABLE\_RACE\_DETECTION configuration option. 
+- Removed implicit conversions between raw pointers and host\_device\_ptrs/host\_ptrs and the corresponding CARE\_ENABLE\_IMPLICIT\_CONVERSIONS configuration option. ### Changed - Renamed host\_device\_ptr::getPointer to host\_device\_ptr::data. diff --git a/cmake/SetupOptions.cmake b/cmake/SetupOptions.cmake index db7fe22f..d70b436c 100644 --- a/cmake/SetupOptions.cmake +++ b/cmake/SetupOptions.cmake @@ -14,10 +14,6 @@ option(ENABLE_PICK "Enable pick and set methods on ManagedArrays" ON) option(ENABLE_PINNED "Enable pinned memory space" ON) option(CARE_ENABLE_PINNED_MEMORY_FOR_SCANS "Use pinned memory for scan lengths" ON) option(CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU "Allows default memory spaces for ZERO_COPY and PAGEABLE to be the GPU memory space" OFF) -# Option to disable implicit conversion between host_device_ptr and raw arrays in CARE. -option(CARE_ENABLE_IMPLICIT_CONVERSIONS "Enable implicit conversions to-from raw pointers" ON) -# CHAI must also be configured with the same settings for implicit conversions. 
-set(CHAI_ENABLE_IMPLICIT_CONVERSIONS ${CARE_ENABLE_IMPLICIT_CONVERSIONS} CACHE BOOL "Enable implicit conversions to-from raw pointers") option(CARE_LEGACY_COMPATIBILITY_MODE "Enable legacy compatibility mode" OFF) option(CARE_DEEP_COPY_RAW_PTR "Use deep copy for managed array initialization from raw pointer" OFF) option(CARE_ENABLE_MANAGED_PTR "Enable managed_ptr aliases, tests, and reproducer" ON) diff --git a/configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake b/configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake index 3edbf270..560e9fd1 100644 --- a/configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake +++ b/configs/lc/toss_4_x86_64_ib_cray/amdclang.cmake @@ -13,5 +13,3 @@ set(ENABLE_HIP ON CACHE BOOL "Enable Hip") set(ROCM_PATH "/usr/tce/packages/rocmcc/rocmcc-6.1.0-magic" CACHE PATH "") set(CMAKE_HIP_ARCHITECTURES "gfx942:xnack+" CACHE STRING "") set(AMDGPU_TARGETS "gfx942:xnack+" CACHE STRING "") - -set(CARE_ENABLE_IMPLICIT_CONVERSIONS OFF CACHE BOOL "Enable implicit conversions") diff --git a/src/care/KeyValueSorter_decl.h b/src/care/KeyValueSorter_decl.h index 8fd0101c..a5fc9d9c 100644 --- a/src/care/KeyValueSorter_decl.h +++ b/src/care/KeyValueSorter_decl.h @@ -174,31 +174,6 @@ class CARE_DLL_API KeyValueSorter { setKeyValueArraysFromManagedArray(m_keys, m_values, len, arr); } -#if defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - - /////////////////////////////////////////////////////////////////////////// - /// @author Alan Dayton - /// - /// @brief Constructor - /// - /// Allocates space and initializes the KeyValueSorter by copying - /// elements and ordering from the given managed array - /// - /// @note This overload is needed to prevent ambiguity when implicit - /// casts are enabled - /// - /// @param[in] len - The number of elements to allocate space for - /// @param[in] arr - The managed array to copy elements from - /// - /// @return a KeyValueSorter instance - /////////////////////////////////////////////////////////////////////////// - KeyValueSorter(const 
size_t len, const host_device_ptr & arr) - : KeyValueSorter(len, host_device_ptr(arr)) - { - } - -#endif // defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - /////////////////////////////////////////////////////////////////////////// /// @author Alan Dayton /// @brief (Shallow) Copy constructor @@ -758,32 +733,6 @@ class CARE_DLL_API KeyValueSorter { setKeyValueArraysFromManagedArray(m_keyValues, len, arr); } -#if defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - - /////////////////////////////////////////////////////////////////////////// - /// @author Alan Dayton - /// - /// @brief Constructor - /// - /// Allocates space and initializes the KeyValueSorter by copying - /// elements and ordering from the given managed array - /// - /// @note This overload is needed to prevent ambiguity when implicit - /// casts are enabled - /// - /// @param[in] len - The number of elements to allocate space for - /// @param[in] arr - The managed array to copy elements from - /// - /// @return a KeyValueSorter instance - /// - /////////////////////////////////////////////////////////////////////////// - KeyValueSorter(const size_t len, const host_device_ptr & arr) - : KeyValueSorter(len, host_device_ptr(arr)) - { - } - -#endif // defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - /////////////////////////////////////////////////////////////////////////// /// @author Alan Dayton /// @brief (Shallow) Copy constructor diff --git a/src/care/algorithm_decl.h b/src/care/algorithm_decl.h index 93765e21..11a200d4 100644 --- a/src/care/algorithm_decl.h +++ b/src/care/algorithm_decl.h @@ -250,16 +250,6 @@ CARE_HOST_DEVICE bool checkSorted(const care::host_device_ptr& array, c const bool allowDuplicates = false, const bool warnOnFailure = true); -#if defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - -template -CARE_HOST_DEVICE bool checkSorted(const care::host_device_ptr& array, const int len, - const char* name, const char* argname, - const bool allowDuplicates = false, - const bool warnOnFailure = true); - -#endif 
// defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - template CARE_HOST_DEVICE CARE_DLL_API int BinarySearch(const mapType *map, const int start, diff --git a/src/care/algorithm_impl.h b/src/care/algorithm_impl.h index c3a42247..f77c38db 100644 --- a/src/care/algorithm_impl.h +++ b/src/care/algorithm_impl.h @@ -103,21 +103,6 @@ CARE_HOST_DEVICE CARE_INLINE bool checkSorted(const care::host_device_ptr(array.data(), len, name, argname, allowDuplicates, warnOnFailure); } -#if defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - -template -CARE_HOST_DEVICE CARE_INLINE bool checkSorted(const care::host_device_ptr& array, - const int len, - const char* name, - const char* argname, - const bool allowDuplicates, - const bool warnOnFailure) -{ - return checkSorted(care::host_device_ptr(array), len, name, argname, allowDuplicates, warnOnFailure); -} - -#endif // defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - /************************************************************************ * Function : IntersectArrays * Author(s) : Peter Robinson, based on IntersectGlobalIDArrays by Al Nichols diff --git a/src/care/care_inst.h b/src/care/care_inst.h index b8d8480b..d9d82775 100644 --- a/src/care/care_inst.h +++ b/src/care/care_inst.h @@ -100,21 +100,6 @@ CARE_EXTERN template CARE_DLL_API CARE_HOST_DEVICE bool checkSorted(const care::host_device_ptr&, const int, const char*, const char*, const bool, const bool) ; #endif -#if defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - -CARE_EXTERN template CARE_DLL_API -CARE_HOST_DEVICE bool checkSorted(const care::host_device_ptr&, const int, const char*, const char*, const bool, const bool) ; -CARE_EXTERN template CARE_DLL_API -CARE_HOST_DEVICE bool checkSorted(const care::host_device_ptr&, const int, const char*, const char*, const bool, const bool) ; -CARE_EXTERN template CARE_DLL_API -CARE_HOST_DEVICE bool checkSorted(const care::host_device_ptr&, const int, const char*, const char*, const bool, const bool) ; -#if CARE_HAVE_LLNL_GLOBALID -CARE_EXTERN template 
CARE_DLL_API -CARE_HOST_DEVICE bool checkSorted(const care::host_device_ptr&, const int, const char*, const char*, const bool, const bool) ; -#endif - -#endif // defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - /////////////////////////////////////////////////////////////////////////////// #ifdef CARE_PARALLEL_DEVICE diff --git a/src/care/config.h.in b/src/care/config.h.in index b03790ae..24e99a64 100644 --- a/src/care/config.h.in +++ b/src/care/config.h.in @@ -15,7 +15,6 @@ #cmakedefine CARE_ENABLE_BOUNDS_CHECKING #cmakedefine01 CARE_ENABLE_GPU_SIMULATION_MODE #cmakedefine CARE_NEVER_USE_RAJA_PARALLEL_SCAN -#cmakedefine CARE_ENABLE_IMPLICIT_CONVERSIONS #ifndef CARE_LEGACY_COMPATIBILITY_MODE #cmakedefine01 CARE_LEGACY_COMPATIBILITY_MODE #endif diff --git a/src/care/host_device_ptr.h b/src/care/host_device_ptr.h index 3dfe40f4..633d3a1d 100644 --- a/src/care/host_device_ptr.h +++ b/src/care/host_device_ptr.h @@ -97,22 +97,6 @@ namespace care { /// CARE_HOST_DEVICE host_device_ptr(std::nullptr_t from) noexcept : MA (from) {} -#if defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - /// - /// @author Peter Robinson - /// - /// Construct from a raw pointer - /// - /// @note Only safe if the raw pointer is already registered with CHAI - /// - template - CARE_HOST_DEVICE host_device_ptr( - T * from, //!< Raw pointer to construct from - chai::CHAIDISAMBIGUATE name=chai::CHAIDISAMBIGUATE(), //!< Used to disambiguate this constructor - bool foo=Q) //!< Used to disambiguate this constructor - : MA(from, name, foo) {} -#endif - /// /// @author Peter Robinson /// diff --git a/src/care/host_ptr.h b/src/care/host_ptr.h index 8b455199..65d2cb2b 100644 --- a/src/care/host_ptr.h +++ b/src/care/host_ptr.h @@ -123,15 +123,6 @@ namespace care { return m_ptr[index]; } -#if defined(CARE_ENABLE_IMPLICIT_CONVERSIONS) - /// - /// @author Peter Robinson - /// - /// Convert to a raw pointer - /// - operator T*() const { return m_ptr; } -#endif - /// /// @author Peter Robinson /// From 
daa9b83b8542cfd904f6b249d99af0ec909b4a91 Mon Sep 17 00:00:00 2001 From: Danny Taller <66029857+dtaller@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:24:34 -0800 Subject: [PATCH 3/7] fix gpu errors (#306) * Explicitly define constructors instead of using the compiler generated ones (works around compiler bugs) --- src/care/host_device_map.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/care/host_device_map.h b/src/care/host_device_map.h index a63bb386..a24da555 100644 --- a/src/care/host_device_map.h +++ b/src/care/host_device_map.h @@ -66,7 +66,7 @@ namespace care { class host_device_map< key_type, mapped_type, RAJA::seq_exec> { public: // default constructor - host_device_map() noexcept = default; + host_device_map() noexcept {}; // constructor taking max number of entries host_device_map(size_t max_entries) : host_device_map{} { @@ -86,7 +86,15 @@ namespace care { } // copy constructor - host_device_map(host_device_map const & other) noexcept = default; + host_device_map(host_device_map const & other) noexcept : + m_map(other.m_map), + m_size(other.m_size), + m_iterator(other.m_iterator), + m_next_iterator_index(other.m_next_iterator_index), + m_max_size(other.m_max_size), + m_signal(other.m_signal) + { + } // move constructor host_device_map(host_device_map && other) noexcept { @@ -387,7 +395,7 @@ namespace care { { public: // default constructor - host_device_map() noexcept = default; + host_device_map() noexcept {}; // constructor host_device_map(size_t max_entries) : host_device_map{} { @@ -406,9 +414,16 @@ namespace care { } // copy constructor - host_device_map(host_device_map const & other) noexcept = default; + host_device_map(host_device_map const & other) noexcept : + m_size_ptr(other.m_size_ptr), + m_size(other.m_size), + m_map(other.m_map), + m_max_size(other.m_max_size), + m_signal(other.m_signal) + { + } - // move constructor + // move constructor host_device_map(host_device_map && other) 
noexcept { delete m_size_ptr; m_size_ptr = other.m_size_ptr; From 2cbb58cdc647a814aa238f5ae8becb71d4f9ff4a Mon Sep 17 00:00:00 2001 From: Alan Dayton <6393677+adayton1@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:57:21 -0800 Subject: [PATCH 4/7] Remove wrapper for make_managed_from_factory (#308) --- src/care/managed_ptr.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/care/managed_ptr.h b/src/care/managed_ptr.h index ea3b2a43..7e6dde1a 100644 --- a/src/care/managed_ptr.h +++ b/src/care/managed_ptr.h @@ -25,13 +25,6 @@ namespace care{ inline managed_ptr make_managed(Args&&... args) { return chai::make_managed(std::forward(args)...); } - - template - inline managed_ptr make_managed_from_factory(F&& f, Args&&... args) { - return chai::make_managed_from_factory(std::forward(f), std::forward(args)...); - } } #else // defined(CARE_ENABLE_MANAGED_PTR) From 68f189f0ea4167681368847f21ac7a2f8f8d15a9 Mon Sep 17 00:00:00 2001 From: Ben Liu <38140930+liu15@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:28:22 -0800 Subject: [PATCH 5/7] Fix for clang query (#309) --- src/care/host_device_map.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/care/host_device_map.h b/src/care/host_device_map.h index a24da555..8a41dc0f 100644 --- a/src/care/host_device_map.h +++ b/src/care/host_device_map.h @@ -284,7 +284,8 @@ namespace care { inline CARE_HOST_DEVICE mapped_type at(key_type key) const { int index = care::BinarySearch(m_gpu_map.keys(),0,m_size,key); if (index >= 0) { - return m_gpu_map.values()[index]; + const care::local_ptr& values = m_gpu_map.values(); + return values[index]; } else { return m_signal; @@ -351,13 +352,15 @@ namespace care { // lookups (valid after a sort() call) are done by binary searching the keys and using the // index of the located key to grab the appropriate value inline CARE_DEVICE mapped_type & value_at(int index) const { - return m_gpu_map.values()[index]; + const care::local_ptr& values = 
m_gpu_map.values(); + return values[index]; } // lookups (valid after a sort() call) are done by binary searching the keys and using the // index of the located key to grab the appropriate value inline CARE_DEVICE key_type const & key_at(int index) const { - return m_gpu_map.keys()[index]; + const care::local_ptr& keys = m_gpu_map.keys(); + return keys[index]; } inline CARE_DEVICE iterator iterator_at(int index) const { From b79bf2b31f78fc556923be10b5ef5f2556f0ad95 Mon Sep 17 00:00:00 2001 From: Alan Dayton <6393677+adayton1@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:18:17 -0800 Subject: [PATCH 6/7] Remove TSAN_ONLY_ATOMIC stuff (#310) --- RELEASE_NOTES.md | 3 --- cmake/SetupOptions.cmake | 2 -- src/care/atomic.h | 47 ---------------------------------------- src/care/config.h.in | 1 - 4 files changed, 53 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index d3e45687..0be796f2 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -16,10 +16,7 @@ The format of this file is based on [Keep a Changelog](http://keepachangelog.com ### Added - Added CARE\_DEEP\_COPY\_RAW\_PTR configuration option. - -### Added - Added ATOMIC\_SUB, ATOMIC\_LOAD, ATOMIC\_STORE, ATOMIC\_EXCHANGE, and ATOMIC\_CAS macros. -- Added TSAN\_ONLY\_ATOMIC\_\* macros to suppress tsan data race reports. Controlled by CARE\_ENABLE\_TSAN\_ONLY\_ATOMICS configuration option. ### Removed - Removed Accessor template parameter from host\_device\_ptr. diff --git a/cmake/SetupOptions.cmake b/cmake/SetupOptions.cmake index d70b436c..2ef13d5f 100644 --- a/cmake/SetupOptions.cmake +++ b/cmake/SetupOptions.cmake @@ -29,8 +29,6 @@ option(CARE_NEVER_USE_RAJA_PARALLEL_SCAN "Disable RAJA parallel scans in SCAN lo option(CARE_ENABLE_FUSER_BIN_32 "Enable the 32 register fusible loop bin." OFF) option(CARE_ENABLE_PARALLEL_LOOP_BACKWARDS "Reverse the start and end for parallel loops." OFF) option(CARE_ENABLE_STALE_DATA_CHECK "Enable checking for stale host data. 
Only applicable for GPU (or GPU simulation) builds." OFF) -# TODO: Investigate correctness and performance impact of this option -option(CARE_ENABLE_TSAN_ONLY_ATOMICS "Enable atomics for ThreadSanitizer (TSAN) build." OFF) # Extra components cmake_dependent_option(CARE_ENABLE_TESTS "Build CARE tests" diff --git a/src/care/atomic.h b/src/care/atomic.h index b4ab7efb..74f6aabc 100644 --- a/src/care/atomic.h +++ b/src/care/atomic.h @@ -25,51 +25,4 @@ using RAJAAtomic = RAJA::auto_atomic; #define ATOMIC_EXCHANGE(ref, val) RAJA::atomicExchange(&(ref), val) #define ATOMIC_CAS(ref, compare, val) RAJA::atomicCAS(&(ref), compare, val) -/// -/// Macros that use atomics for a ThreadSanitizer build to avoid false -/// positives, but otherwise do a non-atomic operation (for cases where -/// the order of execution does not matter, such as multiple threads -/// setting the same variable to the same value). -/// -/// WARNING: The returned previous value for the TSAN_ONLY_ATOMIC_* macros -/// should generally not be used in a parallel context, since -/// another thread may have modified the value at the given memory -/// location in between the current thread's read and write. If the -/// return value is needed, use the ATOMIC_* macros instead. -/// -/// TODO: Evaluate whether the compiler actually does the right thing without -/// atomics and whether using atomics detracts from performance. 
-/// -#if defined(CARE_ENABLE_TSAN_ONLY_ATOMICS) - -#define TSAN_ONLY_ATOMIC_ADD(ref, inc) ATOMIC_ADD(ref, inc) -#define TSAN_ONLY_ATOMIC_SUB(ref, inc) ATOMIC_SUB(ref, inc) -#define TSAN_ONLY_ATOMIC_MIN(ref, val) ATOMIC_MIN(ref, val) -#define TSAN_ONLY_ATOMIC_MAX(ref, val) ATOMIC_MAX(ref, val) -#define TSAN_ONLY_ATOMIC_OR(ref, val) ATOMIC_OR(ref, val) -#define TSAN_ONLY_ATOMIC_AND(ref, val) ATOMIC_AND(ref, val) -#define TSAN_ONLY_ATOMIC_XOR(ref, val) ATOMIC_XOR(ref, val) -#define TSAN_ONLY_ATOMIC_LOAD(ref) ATOMIC_LOAD(ref) -#define TSAN_ONLY_ATOMIC_STORE(ref, val) ATOMIC_STORE(ref, val) -#define TSAN_ONLY_ATOMIC_EXCHANGE(ref, val) ATOMIC_EXCHANGE(ref, val) -#define TSAN_ONLY_ATOMIC_CAS(ref, compare, val) ATOMIC_CAS(ref, compare, val) - -#else - -using TSANOnlyAtomic = RAJA::seq_atomic; - -#define TSAN_ONLY_ATOMIC_ADD(ref, inc) RAJA::atomicAdd(&(ref), inc) -#define TSAN_ONLY_ATOMIC_SUB(ref, inc) RAJA::atomicSub(&(ref), inc) -#define TSAN_ONLY_ATOMIC_MIN(ref, val) RAJA::atomicMin(&(ref), val) -#define TSAN_ONLY_ATOMIC_MAX(ref, val) RAJA::atomicMax(&(ref), val) -#define TSAN_ONLY_ATOMIC_OR(ref, val) RAJA::atomicOr(&(ref), val) -#define TSAN_ONLY_ATOMIC_AND(ref, val) RAJA::atomicAnd(&(ref), val) -#define TSAN_ONLY_ATOMIC_XOR(ref, val) RAJA::atomicXor(&(ref), val) -#define TSAN_ONLY_ATOMIC_LOAD(ref) RAJA::atomicLoad(&(ref)) -#define TSAN_ONLY_ATOMIC_STORE(ref, val) RAJA::atomicStore(&(ref), val) -#define TSAN_ONLY_ATOMIC_EXCHANGE(ref, val) RAJA::atomicExchange(&(ref), val) -#define TSAN_ONLY_ATOMIC_CAS(ref, compare, val) RAJA::atomicCAS(&(ref), compare, val) - -#endif - #endif // CARE_ATOMIC_H diff --git a/src/care/config.h.in b/src/care/config.h.in index 24e99a64..c26353ea 100644 --- a/src/care/config.h.in +++ b/src/care/config.h.in @@ -27,7 +27,6 @@ #cmakedefine01 CARE_ENABLE_PINNED_MEMORY_FOR_SCANS #cmakedefine CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU #cmakedefine CARE_ENABLE_STALE_DATA_CHECK -#cmakedefine CARE_ENABLE_TSAN_ONLY_ATOMICS // Optional dependencies 
#cmakedefine01 CARE_HAVE_LLNL_GLOBALID From 8a054d390b80c630054f093f2a872c2de2b147e1 Mon Sep 17 00:00:00 2001 From: Ben Liu <38140930+liu15@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:40:32 -0800 Subject: [PATCH 7/7] Add additional REDUCE macros (#297) * Add additional REDUCE macros * Use gpu_reduce instead of parallel_reduce --------- Co-authored-by: Alan Dayton <6393677+adayton1@users.noreply.github.com> --- src/care/DefaultMacros.h | 7 ++- src/care/LoopFuser.h | 45 +++++++++++++ src/care/forall.h | 9 ++- src/care/policies.h | 4 +- src/care/scan.h | 132 +++++++++++++++++++++++++++++++++++---- 5 files changed, 180 insertions(+), 17 deletions(-) diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h index 2b31ee8e..0066cf1b 100644 --- a/src/care/DefaultMacros.h +++ b/src/care/DefaultMacros.h @@ -736,7 +736,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _ #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK) #define CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \ - CARE_CHECKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHECK) + CARE_CHECKED_POLICY_LOOP_START(care::gpu_reduce,INDEX, START_INDEX, END_INDEX, CHECK) #define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK) @@ -771,7 +771,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _ #define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) #define CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \ - CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) + CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::gpu_reduce,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) #define CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK) @@ -1278,6 +1278,9 @@ OMP_FOR_BEGIN for 
(auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _ launch_2D_jagged(care::gpu{}, XSTART, XEND, XLENGTHS.data(chai::DEFAULT, true), YSTART, YLENGTH, __FILE__, __LINE__, [=] CARE_DEVICE (int XINDEX, int YINDEX)->void { #define CARE_LOOP_2D_STREAM_JAGGED_END }); +#define CARE_LOOP_2D_REDUCE_JAGGED(XINDEX, XSTART, XEND, XLENGTHS, YINDEX, YSTART, YLENGTH, FLAT_INDEX) \ + launch_2D_jagged(care::gpu_reduce{}, XSTART, XEND, XLENGTHS.data(chai::DEFAULT, true), YSTART, YLENGTH, __FILE__, __LINE__, [=] CARE_DEVICE (int XINDEX, int YINDEX)->void { +#define CARE_LOOP_2D_REDUCE_JAGGED_END }); #endif // !defined(_CARE_DEFAULT_MACROS_H_) diff --git a/src/care/LoopFuser.h b/src/care/LoopFuser.h index 76f7aa8e..13bf8db6 100644 --- a/src/care/LoopFuser.h +++ b/src/care/LoopFuser.h @@ -1273,9 +1273,14 @@ void LoopFuser::registerAction(const char * fileName, #define FUSIBLE_LOOP_STREAM_R_END \ } }); } FUSIBLE_FLUSH_IF_NEEDED +#define FUSIBLE_REDUCE_LOOP_R FUSIBLE_LOOP_STREAM_R +#define FUSIBLE_REDUCE_LOOP_R_END FUSIBLE_LOOP_STREAM_R_END + #define FUSIBLE_LOOP_STREAM(INDEX, START, END) FUSIBLE_LOOP_STREAM_R(INDEX, START, END, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT) #define FUSIBLE_LOOP_STREAM_END FUSIBLE_LOOP_STREAM_R_END +#define FUSIBLE_REDUCE_LOOP FUSIBLE_LOOP_STREAM +#define FUSIBLE_REDUCE_LOOP_END FUSIBLE_LOOP_STREAM_END #define FUSIBLE_KERNEL_R(REGISTER_COUNT) { \ auto __fuser__ = LOOPFUSER(REGISTER_COUNT)::getInstance(); \ @@ -1304,9 +1309,14 @@ void LoopFuser::registerAction(const char * fileName, } \ }); }} +#define FUSIBLE_REDUCE_LOOP_PHASE_R FUSIBLE_LOOP_PHASE_R +#define FUSIBLE_REDUCE_LOOP_PHASE_R_END FUSIBLE_LOOP_PHASE_R_END + #define FUSIBLE_LOOP_PHASE(INDEX, START, END, PRIORITY) FUSIBLE_LOOP_PHASE_R(INDEX, START, END, PRIORITY, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT) #define FUSIBLE_LOOP_PHASE_END FUSIBLE_LOOP_PHASE_R_END +#define FUSIBLE_REDUCE_LOOP_PHASE FUSIBLE_LOOP_PHASE +#define FUSIBLE_REDUCE_LOOP_PHASE_END FUSIBLE_LOOP_PHASE_END #define 
FUSIBLE_KERNEL_PHASE_R(PRIORITY, REGISTER_COUNT) { \ LOOPFUSER(REGISTER_COUNT) * __fuser__ = FusedActionsObserver::getActiveObserver()->getFusedActions(PRIORITY); \ @@ -1382,11 +1392,17 @@ void LoopFuser::registerAction(const char * fileName, #define FUSIBLE_LOOP_SCAN(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) \ FUSIBLE_LOOP_SCAN_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT) +#define FUSIBLE_REDUCE_LOOP_SCAN_R FUSIBLE_LOOP_SCAN_R +#define FUSIBLE_REDUCE_LOOP_SCAN FUSIBLE_LOOP_SCAN + #define _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) } return 0; }, 1, POS_STORE_DESTINATION); } #define FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) FUSIBLE_FLUSH_IF_NEEDED #define FUSIBLE_LOOP_SCAN_END(LENGTH, POS, POS_STORE_DESTINATION) FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) +#define FUSIBLE_REDUCE_LOOP_SCAN_R_END FUSIBLE_LOOP_SCAN_R_END +#define FUSIBLE_REDUCE_LOOP_SCAN_END FUSIBLE_LOOP_SCAN_END + #define FUSIBLE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, REGISTER_COUNT) \ _FUSIBLE_LOOP_SCAN_R(FusedActionsObserver::getActiveObserver()->getFusedActions(PRIORITY), \ INDEX, START, END, POS, INIT_POS, BOOL_EXPR, REGISTER_COUNT) @@ -1394,9 +1410,14 @@ void LoopFuser::registerAction(const char * fileName, #define FUSIBLE_LOOP_SCAN_PHASE(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY) \ FUSIBLE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT) +#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R FUSIBLE_LOOP_SCAN_PHASE_R +#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE FUSIBLE_LOOP_SCAN_PHASE + #define FUSIBLE_LOOP_SCAN_PHASE_END(LENGTH, POS, POS_STORE_DESTINATION) _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) #define FUSIBLE_LOOP_SCAN_PHASE_R_END(LENGTH, POS, POS_STORE_DESTINATION) _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) 
+#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R_END FUSIBLE_LOOP_SCAN_PHASE_R_END +#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_END FUSIBLE_LOOP_SCAN_PHASE_END // note - FUSED_SCANVAR will be nullptr if we are not recording, as there will be no need for an intermediate // FUSED_SCANVAR, so we won't need to write to it in the action or store into it in the conditional @@ -1466,6 +1487,8 @@ void LoopFuser::registerAction(const char * fileName, #define FUSIBLE_LOOP_STREAM_R(INDEX, START, END, REGISTER_COUNT) CARE_STREAM_LOOP(INDEX, START, END) #define FUSIBLE_LOOP_STREAM(INDEX, START, END) CARE_STREAM_LOOP(INDEX, START, END) +#define FUSIBLE_REDUCE_LOOP_R(INDEX, START, END, REGISTER_COUNT) CARE_REDUCE_LOOP(INDEX, START, END) +#define FUSIBLE_REDUCE_LOOP(INDEX, START, END) CARE_REDUCE_LOOP(INDEX, START, END) #define FUSIBLE_LOOP_PHASE_R(INDEX, START, END, PRIORITY, REGISTER_COUNT) CARE_STREAM_LOOP(INDEX, START, END) #define FUSIBLE_LOOP_PHASE(INDEX, START, END, PRIORITY) CARE_STREAM_LOOP(INDEX, START, END) @@ -1473,6 +1496,12 @@ void LoopFuser::registerAction(const char * fileName, #define FUSIBLE_LOOP_PHASE_END CARE_STREAM_LOOP_END #define FUSIBLE_LOOP_PHASE_R_END CARE_STREAM_LOOP_END +#define FUSIBLE_REDUCE_LOOP_PHASE_R(INDEX, START, END, PRIORITY, REGISTER_COUNT) CARE_REDUCE_LOOP(INDEX, START, END) +#define FUSIBLE_REDUCE_LOOP_PHASE(INDEX, START, END, PRIORITY) CARE_REDUCE_LOOP(INDEX, START, END) + +#define FUSIBLE_REDUCE_LOOP_PHASE_END CARE_REDUCE_LOOP_END +#define FUSIBLE_REDUCE_LOOP_PHASE_R_END CARE_REDUCE_LOOP_END + #define FUSIBLE_FLUSH_IF_NEEDED #define FUSIBLE_PHASE_RESET @@ -1485,6 +1514,9 @@ void LoopFuser::registerAction(const char * fileName, #define FUSIBLE_LOOP_STREAM_R_END CARE_STREAM_LOOP_END #define FUSIBLE_LOOP_STREAM_END CARE_STREAM_LOOP_END +#define FUSIBLE_REDUCE_LOOP_R_END CARE_REDUCE_LOOP_END +#define FUSIBLE_REDUCE_LOOP_END CARE_REDUCE_LOOP_END + #define FUSIBLE_KERNEL_PHASE_R_END CARE_PARALLEL_KERNEL_END #define FUSIBLE_KERNEL_R_END 
CARE_PARALLEL_KERNEL_END #define FUSIBLE_KERNEL_PHASE_END CARE_PARALLEL_KERNEL_END @@ -1501,16 +1533,29 @@ void LoopFuser::registerAction(const char * fileName, #define FUSIBLE_LOOP_SCAN_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, REGISTER_COUNT) SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) #define FUSIBLE_LOOP_SCAN(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) +#define FUSIBLE_REDUCE_LOOP_SCAN_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, REGISTER_COUNT) SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) +#define FUSIBLE_REDUCE_LOOP_SCAN(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) + #define FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) #define FUSIBLE_LOOP_SCAN_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) +#define FUSIBLE_REDUCE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) +#define FUSIBLE_REDUCE_LOOP_SCAN_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) + #define FUSIBLE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, REGISTER_COUNT) \ SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) #define FUSIBLE_LOOP_SCAN_PHASE(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY) SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) +#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, REGISTER_COUNT) \ + SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) +#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY) SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) + #define FUSIBLE_LOOP_SCAN_PHASE_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) 
#define FUSIBLE_LOOP_SCAN_PHASE_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) +#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) +#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION) + #define FUSIBLE_FREE(A) A.free() #define FUSIBLE_FREE_DEVICE(A,DEST,ELEM) care::wrappedFreeDeviceMemory(A, DEST, ELEM); diff --git a/src/care/forall.h b/src/care/forall.h index b7b79502..fb60ae47 100644 --- a/src/care/forall.h +++ b/src/care/forall.h @@ -265,7 +265,7 @@ namespace care { /// /// @brief Execute using the care::RAJAReductionExec policy /// - /// @arg[in] parallel_reduce Used to choose this overload of forall + /// @arg[in] gpu_reduce Used to choose this overload of forall /// @arg[in] fileName The name of the file where this function is called /// @arg[in] lineNumber The line number in the file where this function is called /// @arg[in] start The starting index (inclusive) @@ -275,7 +275,7 @@ namespace care { /// //////////////////////////////////////////////////////////////////////////////// template <typename LB> - void forall(parallel_reduce, const char * fileName, const int lineNumber, + void forall(gpu_reduce, const char * fileName, const int lineNumber, const int start, const int end, const int batch_size, LB&& body) { #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = true; @@ -627,6 +627,11 @@ namespace care { arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE); } } + + template <typename LB> + void launch_2D_jagged(care::gpu_reduce, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber , LB && body) { + launch_2D_jagged(care::gpu{}, xstart, xend, gpu_lengths, ystart, ylength, fileName, lineNumber, body) ; + } #endif } // namespace care diff --git a/src/care/policies.h b/src/care/policies.h index 
b134c5b0..4eac22e3 100644 --- a/src/care/policies.h +++ b/src/care/policies.h @@ -14,8 +14,8 @@ namespace care { struct sequential {}; struct openmp {}; struct gpu {}; + struct gpu_reduce {}; struct parallel {}; - struct parallel_reduce {}; struct raja_fusible {}; struct raja_fusible_seq {}; struct managed_ptr_read {}; @@ -27,8 +27,8 @@ namespace care { sequential, openmp, gpu, + gpu_reduce, parallel, - parallel_reduce, managed_ptr_read, managed_ptr_write }; diff --git a/src/care/scan.h b/src/care/scan.h index 15c378b3..0d5e9d3b 100644 --- a/src/care/scan.h +++ b/src/care/scan.h @@ -342,6 +342,20 @@ using ScanVarGID = chai::ManagedArray; } CARE_CHECKED_SEQUENTIAL_LOOP_END(scan_loop_init_check) \ } +#define SCAN_REDUCE_LOOP_INIT(INDX, START, END, SCANVAR, SCANVARLENGTH, SCANVAR_OFFSET, EXPR) \ + if (END - START > 0) { \ + int const SCANVARENDNAME(SCANVAR) = END; \ + CARE_CHECKED_REDUCE_LOOP_START(INDX, START, END+1, scan_reduce_loop_init_check) { \ + SCANVAR[INDX-START] = (INDX != SCANVARENDNAME(SCANVAR)) && (EXPR) ; \ + } CARE_CHECKED_REDUCE_LOOP_END(scan_reduce_loop_init_check) \ + care::exclusive_scan(RAJAExec{}, SCANVAR, nullptr, END-START+1, SCANVAR_OFFSET, true); \ + } else { \ + CARE_CHECKED_SEQUENTIAL_LOOP_START(INDX, 0, 1, scan_reduce_loop_init_check) { \ + SCANVAR[INDX] = SCANVAR_OFFSET; \ + SCANVARLENGTH[0] = SCANVAR_OFFSET; \ + } CARE_CHECKED_SEQUENTIAL_LOOP_END(scan_reduce_loop_init_check) \ + } + #if CARE_HAVE_LLNL_GLOBALID #define SCAN_LOOP_GID_INIT(INDX, START, END, SCANVAR, SCANVARLENGTH, SCANVAR_OFFSET, EXPR) \ @@ -358,6 +372,20 @@ using ScanVarGID = chai::ManagedArray; } CARE_CHECKED_SEQUENTIAL_LOOP_END(scan_loop_gid_init_check) \ } +#define SCAN_REDUCE_LOOP_GID_INIT(INDX, START, END, SCANVAR, SCANVARLENGTH, SCANVAR_OFFSET, EXPR) \ + if (END - START > 0) { \ + int const SCANVARENDNAME(SCANVAR) = END; \ + CARE_CHECKED_REDUCE_LOOP_START(INDX, START, END+1, scan_reduce_loop_gid_init_check) { \ + SCANVAR[INDX-START] = (INDX != SCANVARENDNAME(SCANVAR)) 
&& (EXPR) ; \ + } CARE_CHECKED_REDUCE_LOOP_END(scan_reduce_loop_gid_init_check) \ + care::exclusive_scan(RAJAExec{}, SCANVAR, nullptr, END-START+1, SCANVAR_OFFSET.Value(), true); \ + } else { \ + CARE_CHECKED_SEQUENTIAL_LOOP_START(INDX, 0, 1, scan_reduce_loop_gid_init_check) { \ + SCANVAR[INDX] = SCANVAR_OFFSET.Value(); \ + SCANVARLENGTH[0] = SCANVAR_OFFSET.Value(); \ + } CARE_CHECKED_SEQUENTIAL_LOOP_END(scan_reduce_loop_gid_init_check) \ + } + #endif // CARE_HAVE_LLNL_GLOBALID #define MANAGED_PTR_SCAN_LOOP_INIT(INDX, START, END, SCANVAR, SCANVARLENGTH, SCANVAR_OFFSET, EXPR) \ @@ -391,13 +419,6 @@ using ScanVarGID = chai::ManagedArray; const int SCANINDX = SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)]; \ if (SCANINDX != SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)+1]) { -#define SCAN_LOOP_END(END, SCANINDX, SCANLENGTH) } \ - } CARE_CHECKED_PARALLEL_LOOP_END(scan_loop_check) \ - SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH) \ - SCANVARNAME(SCANINDX).free(); \ - SCANVARLENGTHNAME(SCANINDX).free(); \ - } - #define SCAN_LOOP_64(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ { \ int const SCANVARSTARTNAME(SCANINDX) = START; \ @@ -405,13 +426,69 @@ using ScanVarGID = chai::ManagedArray; ScanVar64 SCANVARLENGTHNAME(SCANINDX)(1, CARE_SCANVARLENGTHNAME_SPACE); \ SCAN_LOOP_INIT(INDX, SCANVARSTARTNAME(SCANINDX), END, SCANVARNAME(SCANINDX), SCANVARLENGTHNAME(SCANINDX), SCANINDX_OFFSET, EXPR); \ int const SCANVARENDNAME(SCANINDX) = END; \ - CARE_CHECKED_PARALLEL_LOOP_START(INDX, START, END, scan_loop_check) { \ + CARE_CHECKED_PARALLEL_LOOP_START(INDX, START, END, scan_loop_64_check) { \ if (INDX == SCANVARENDNAME(SCANINDX) -1) { \ SCANVARLENGTHNAME(SCANINDX)[0] = SCANVARNAME(SCANINDX)[SCANVARENDNAME(SCANINDX)-START]; \ } \ const size_t SCANINDX = SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)]; \ if (SCANINDX != SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)+1]) { +#define SCAN_REDUCE_LOOP(INDX, START, END, SCANINDX, 
SCANINDX_OFFSET, EXPR) \ + { \ + int const SCANVARSTARTNAME(SCANINDX) = START; \ + ScanVar SCANVARNAME(SCANINDX)(END-START+1); \ + ScanVar SCANVARLENGTHNAME(SCANINDX)(1, CARE_SCANVARLENGTHNAME_SPACE); \ + SCAN_REDUCE_LOOP_INIT(INDX, SCANVARSTARTNAME(SCANINDX), END, SCANVARNAME(SCANINDX), SCANVARLENGTHNAME(SCANINDX), SCANINDX_OFFSET, EXPR); \ + int const SCANVARENDNAME(SCANINDX) = END; \ + CARE_CHECKED_REDUCE_LOOP_START(INDX, START, END, scan_reduce_loop_check) { \ + if (INDX == SCANVARENDNAME(SCANINDX) -1) { \ + SCANVARLENGTHNAME(SCANINDX)[0] = SCANVARNAME(SCANINDX)[SCANVARENDNAME(SCANINDX)-START]; \ + } \ + const int SCANINDX = SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)]; \ + if (SCANINDX != SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)+1]) { + +#define SCAN_REDUCE_LOOP_64(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ + { \ + int const SCANVARSTARTNAME(SCANINDX) = START; \ + ScanVar64 SCANVARNAME(SCANINDX)(END-START+1); \ + ScanVar64 SCANVARLENGTHNAME(SCANINDX)(1, CARE_SCANVARLENGTHNAME_SPACE); \ + SCAN_REDUCE_LOOP_INIT(INDX, SCANVARSTARTNAME(SCANINDX), END, SCANVARNAME(SCANINDX), SCANVARLENGTHNAME(SCANINDX), SCANINDX_OFFSET, EXPR); \ + int const SCANVARENDNAME(SCANINDX) = END; \ + CARE_CHECKED_REDUCE_LOOP_START(INDX, START, END, scan_reduce_loop_64_check) { \ + if (INDX == SCANVARENDNAME(SCANINDX) -1) { \ + SCANVARLENGTHNAME(SCANINDX)[0] = SCANVARNAME(SCANINDX)[SCANVARENDNAME(SCANINDX)-START]; \ + } \ + const size_t SCANINDX = SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)]; \ + if (SCANINDX != SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)+1]) { + +#define SCAN_LOOP_END(END, SCANINDX, SCANLENGTH) } \ + } CARE_CHECKED_PARALLEL_LOOP_END(scan_loop_check) \ + SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH) \ + SCANVARNAME(SCANINDX).free(); \ + SCANVARLENGTHNAME(SCANINDX).free(); \ + } + +#define SCAN_LOOP_64_END(END, SCANINDX, SCANLENGTH) } \ + } CARE_CHECKED_PARALLEL_LOOP_END(scan_loop_64_check) \ + 
SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH) \ + SCANVARNAME(SCANINDX).free(); \ + SCANVARLENGTHNAME(SCANINDX).free(); \ + } + +#define SCAN_REDUCE_LOOP_END(END, SCANINDX, SCANLENGTH) } \ + } CARE_CHECKED_REDUCE_LOOP_END(scan_reduce_loop_check) \ + SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH) \ + SCANVARNAME(SCANINDX).free(); \ + SCANVARLENGTHNAME(SCANINDX).free(); \ + } + +#define SCAN_REDUCE_LOOP_64_END(END, SCANINDX, SCANLENGTH) } \ + } CARE_CHECKED_REDUCE_LOOP_END(scan_reduce_loop_64_check) \ + SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH) \ + SCANVARNAME(SCANINDX).free(); \ + SCANVARLENGTHNAME(SCANINDX).free(); \ + } + #if CARE_HAVE_LLNL_GLOBALID #define SCAN_LOOP_GID(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ @@ -428,6 +505,20 @@ using ScanVarGID = chai::ManagedArray; const globalID SCANINDX = globalID(SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)]); \ if (SCANINDX != globalID(SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)+1])) { +#define SCAN_REDUCE_LOOP_GID(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ + { \ + int const SCANVARSTARTNAME(SCANINDX) = START; \ + ScanVarGID SCANVARNAME(SCANINDX)(END-START+1); \ + ScanVarGID SCANVARLENGTHNAME(SCANINDX)(1, CARE_SCANVARLENGTHNAME_SPACE); \ + SCAN_REDUCE_LOOP_GID_INIT(INDX, SCANVARSTARTNAME(SCANINDX), END, SCANVARNAME(SCANINDX), SCANVARLENGTHNAME(SCANINDX), SCANINDX_OFFSET, EXPR); \ + int const SCANVARENDNAME(SCANINDX) = END; \ + CARE_CHECKED_REDUCE_LOOP_START(INDX, START, END, scan_reduce_loop_gid_check) { \ + if (INDX == SCANVARENDNAME(SCANINDX)-1) { \ + SCANVARLENGTHNAME(SCANINDX)[0] = SCANVARNAME(SCANINDX)[SCANVARENDNAME(SCANINDX)-START]; \ + } \ + const globalID SCANINDX = globalID(SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)]); \ + if (SCANINDX != globalID(SCANVARNAME(SCANINDX)[INDX-SCANVARSTARTNAME(SCANINDX)+1])) { + #define SCAN_LOOP_GID_END(END, SCANINDX, SCANLENGTH) } \ } 
CARE_CHECKED_PARALLEL_LOOP_END(scan_loop_gid_check) \ SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH.Ref()) \ @@ -435,6 +526,13 @@ using ScanVarGID = chai::ManagedArray; SCANVARLENGTHNAME(SCANINDX).free(); \ } +#define SCAN_REDUCE_LOOP_GID_END(END, SCANINDX, SCANLENGTH) } \ + } CARE_CHECKED_REDUCE_LOOP_END(scan_reduce_loop_gid_check) \ + SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH.Ref()) \ + SCANVARNAME(SCANINDX).free(); \ + SCANVARLENGTHNAME(SCANINDX).free(); \ + } + #endif // CARE_HAVE_LLNL_GLOBALID #define SCAN_EVERYWHERE_LOOP(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ @@ -461,16 +559,16 @@ using ScanVarGID = chai::ManagedArray; { \ ScanVar SCANVARNAME(SCANINDX)(END-START+1); \ ScanVar SCANVARLENGTHNAME(SCANINDX)(1, CARE_SCANVARLENGTHNAME_SPACE); \ - SCAN_LOOP_INIT(INDX, START, END, SCANVARNAME(SCANINDX), SCANVARLENGTHNAME(SCANINDX), SCANINDX_OFFSET, EXPR); \ + SCAN_REDUCE_LOOP_INIT(INDX, START, END, SCANVARNAME(SCANINDX), SCANVARLENGTHNAME(SCANINDX), SCANINDX_OFFSET, EXPR); \ int const SCANVARENDNAME(SCANINDX) = END; \ - CARE_CHECKED_PARALLEL_LOOP_START(INDX, START, END, scaneverywhere_reduce_loop_check) { \ + CARE_CHECKED_REDUCE_LOOP_START(INDX, START, END, scaneverywhere_reduce_loop_check) { \ if (INDX == SCANVARENDNAME(SCANINDX)-1) { \ SCANVARLENGTHNAME(SCANINDX)[0] = SCANVARNAME(SCANINDX)[SCANVARENDNAME(SCANINDX)-START]; \ } \ const int SCANINDX = SCANVARNAME(SCANINDX)[INDX-START]; #define SCAN_EVERYWHERE_REDUCE_LOOP_END(END, SCANINDX, SCANLENGTH) \ - } CARE_CHECKED_PARALLEL_LOOP_END(scaneverywhere_reduce_loop_check) \ + } CARE_CHECKED_REDUCE_LOOP_END(scaneverywhere_reduce_loop_check) \ SCAN_LOOP_FINAL(END, SCANVARLENGTHNAME(SCANINDX), SCANLENGTH) \ SCANVARNAME(SCANINDX).free(); \ SCANVARLENGTHNAME(SCANINDX).free(); \ @@ -513,9 +611,15 @@ using ScanVarGID = chai::ManagedArray; #define SCAN_LOOP(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ SCAN_LOOP_P(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) +#define 
SCAN_REDUCE_LOOP(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ + SCAN_LOOP(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) + #define SCAN_LOOP_END(END, SCANINDX, SCANLENGTH) \ SCAN_LOOP_P_END(END, SCANINDX, SCANLENGTH) +#define SCAN_REDUCE_LOOP_END(END, SCANINDX, SCANLENGTH) \ + SCAN_LOOP_END(END, SCANINDX, SCANLENGTH) + #if CARE_HAVE_LLNL_GLOBALID #define SCAN_LOOP_GID(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ @@ -525,11 +629,17 @@ using ScanVarGID = chai::ManagedArray; if (EXPR) { \ const globalID SCANINDX = SCANVARNAME(SCANINDX)++; +#define SCAN_REDUCE_LOOP_GID(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \ + SCAN_LOOP_GID(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) + #define SCAN_LOOP_GID_END(END, SCANINDX, SCANLENGTH) } \ } CARE_CHECKED_SEQUENTIAL_LOOP_WITH_REF_END(scan_loop_gid_check) \ SCANLENGTH = SCANVARNAME(SCANINDX); \ } +#define SCAN_REDUCE_LOOP_GID_END(END, SCANINDX, SCANLENGTH) \ + SCAN_LOOP_GID_END(END, SCANINDX, SCANLENGTH) + #endif // CARE_HAVE_LLNL_GLOBALID #define SCAN_EVERYWHERE_LOOP(INDX, START, END, SCANINDX, SCANINDX_OFFSET, EXPR) \