diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h index 0f58821f..62c48072 100644 --- a/src/care/DefaultMacros.h +++ b/src/care/DefaultMacros.h @@ -267,6 +267,11 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK) +#define CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \ + CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) + +#define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_PARALLEL_LOOP_END(CHECK) + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -545,17 +550,28 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o #define CARE_SET_THREAD_ID(INDEX) #endif -#define CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \ +#define CARE_CHECKED_POLICY_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHECK) { \ auto _care_checked_loop_end = END_INDEX; \ decltype(_care_checked_loop_end) _care_checked_loop_begin = START_INDEX; \ if (_care_checked_loop_end > _care_checked_loop_begin) { \ CARE_NEST_BEGIN(CHECK) \ - care::forall(care::parallel{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ + care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \ CARE_SET_THREAD_ID(INDEX) -#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \ +#define CARE_CHECKED_POLICY_LOOP_END(CHECK) }); \ CARE_NEST_END(CHECK) }} +#define CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \ + CARE_CHECKED_POLICY_LOOP_START(care::parallel,INDEX, START_INDEX, END_INDEX, CHECK) + +#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK) + 
+#define CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \ + CARE_CHECKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHECK) + +#define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK) + + //////////////////////////////////////////////////////////////////////////////// /// /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is @@ -830,9 +846,9 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o /// @arg[in] END_INDEX The ending index (exclusive) /// //////////////////////////////////////////////////////////////////////////////// -#define CARE_REDUCE_LOOP(INDEX, START_INDEX, END_INDEX) CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, care_reduce_loop_check) +#define CARE_REDUCE_LOOP(INDEX, START_INDEX, END_INDEX) CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, care_reduce_loop_check) -#define CARE_REDUCE_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_reduce_loop_check) +#define CARE_REDUCE_LOOP_END CARE_CHECKED_REDUCE_LOOP_END(care_reduce_loop_check) //////////////////////////////////////////////////////////////////////////////// /// diff --git a/src/care/forall.h b/src/care/forall.h index ea2ad41b..22813b1e 100644 --- a/src/care/forall.h +++ b/src/care/forall.h @@ -46,6 +46,10 @@ namespace care { struct ExecutionPolicyToSpace> { static constexpr const chai::ExecutionSpace value = chai::GPU; }; + template <> + struct ExecutionPolicyToSpace { + static constexpr const chai::ExecutionSpace value = chai::GPU; + }; #endif #if CARE_ENABLE_GPU_SIMULATION_MODE @@ -228,6 +232,37 @@ namespace care { #endif PluginData::setParallelContext(false); +#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS + s_reverseLoopOrder = false; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// + /// @author Peter Robinson + /// + /// @brief Execute using the care::RAJAReductionExec policy + /// + /// 
@arg[in] parallel_reduce Used to choose this overload of forall + /// @arg[in] fileName The name of the file where this function is called + /// @arg[in] lineNumber The line number in the file where this function is called + /// @arg[in] start The starting index (inclusive) + /// @arg[in] end The ending index (exclusive) + /// @arg[in] body The loop body to execute at each index + /// + //////////////////////////////////////////////////////////////////////////////// + template + void forall(parallel_reduce, const char * fileName, const int lineNumber, + const int start, const int end, LB&& body) { +#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS + s_reverseLoopOrder = true; +#endif + PluginData::setParallelContext(true); + + forall(RAJAReductionExec{}, fileName, lineNumber, start, end, std::forward(body)); + + PluginData::setParallelContext(false); + #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS s_reverseLoopOrder = false; #endif diff --git a/src/care/policies.h b/src/care/policies.h index 23eae95d..3252ff3c 100644 --- a/src/care/policies.h +++ b/src/care/policies.h @@ -15,6 +15,7 @@ namespace care { struct openmp {}; struct gpu {}; struct parallel {}; + struct parallel_reduce {}; struct raja_fusible {}; struct raja_fusible_seq {}; struct managed_ptr_read {}; @@ -27,6 +28,7 @@ namespace care { openmp, gpu, parallel, + parallel_reduce, managed_ptr_read, managed_ptr_write }; @@ -65,6 +67,12 @@ using RAJADeviceExec = RAJA::seq_exec; #endif // CARE_GPUCC +// reduction kernel policy +#if defined(__HIPCC__) +using RAJAReductionExec = RAJA::hip_exec_occ_calc; +#else +using RAJAReductionExec = RAJADeviceExec; +#endif