Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use hip_occ_calc raja policy for CARE_REDUCE loops with hip. #256

Merged
merged 3 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions src/care/DefaultMacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,11 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o

#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)

// In this (host/OpenMP) configuration a reduce loop needs no special execution
// policy, so the checked reduce-loop macros simply forward to the checked
// parallel-loop macros with identical arguments.
#define CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \
CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)

#define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_PARALLEL_LOOP_END(CHECK)

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
Expand Down Expand Up @@ -545,17 +550,28 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o
#define CARE_SET_THREAD_ID(INDEX)
#endif

#define CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) { \
// Generic checked-loop opener: dispatches the loop body over
// [START_INDEX, END_INDEX) through care::forall using the given POLICY tag
// type. The forall call is skipped entirely when the range is empty, and the
// body is bracketed by CARE_NEST_BEGIN/CARE_NEST_END bookkeeping keyed by
// CHECK. The macro deliberately ends with the lambda still open (after
// CARE_SET_THREAD_ID); user loop-body code follows until the matching
// *_LOOP_END macro closes it.
// NOTE(review): the two care::forall lines below are the before/after lines of
// the rendered diff; only the POLICY{} form is current.
#define CARE_CHECKED_POLICY_LOOP_START(POLICY, INDEX, START_INDEX, END_INDEX, CHECK) { \
auto _care_checked_loop_end = END_INDEX; \
decltype(_care_checked_loop_end) _care_checked_loop_begin = START_INDEX; \
if (_care_checked_loop_end > _care_checked_loop_begin) { \
CARE_NEST_BEGIN(CHECK) \
care::forall(care::parallel{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \
care::forall(POLICY{}, __FILE__, __LINE__, _care_checked_loop_begin, _care_checked_loop_end, [=] CARE_DEVICE (decltype(_care_checked_loop_end) INDEX) { \
CARE_SET_THREAD_ID(INDEX)

// Closes the lambda, the empty-range guard, and the outer scope opened by
// CARE_CHECKED_POLICY_LOOP_START (old PARALLEL name shown above the new one).
#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
#define CARE_CHECKED_POLICY_LOOP_END(CHECK) }); \
CARE_NEST_END(CHECK) }}

// Parallel loops use the care::parallel dispatch tag.
#define CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \
CARE_CHECKED_POLICY_LOOP_START(care::parallel,INDEX, START_INDEX, END_INDEX, CHECK)

#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK)

// Reduce loops use the care::parallel_reduce tag so they can be routed to a
// reduction-specific RAJA execution policy (RAJAReductionExec) by forall.
#define CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \
CARE_CHECKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHECK)

#define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK)


////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
Expand Down Expand Up @@ -830,9 +846,9 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_begin_ndx; INDEX < _care_o
/// @arg[in] END_INDEX The ending index (exclusive)
///
////////////////////////////////////////////////////////////////////////////////
#define CARE_REDUCE_LOOP(INDEX, START_INDEX, END_INDEX) CARE_CHECKED_PARALLEL_LOOP_START(INDEX, START_INDEX, END_INDEX, care_reduce_loop_check)
#define CARE_REDUCE_LOOP(INDEX, START_INDEX, END_INDEX) CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, care_reduce_loop_check)

#define CARE_REDUCE_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_reduce_loop_check)
#define CARE_REDUCE_LOOP_END CARE_CHECKED_REDUCE_LOOP_END(care_reduce_loop_check)

////////////////////////////////////////////////////////////////////////////////
///
Expand Down
35 changes: 35 additions & 0 deletions src/care/forall.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ namespace care {
struct ExecutionPolicyToSpace<RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>> {
static constexpr const chai::ExecutionSpace value = chai::GPU;
};
// Map the reduction execution policy to the chai GPU execution space so CHAI
// data motion treats reduce kernels like other device kernels.
// NOTE(review): this sits in a GPU-guarded region (see the #endif below and
// the hip_exec specialization above); presumably RAJAReductionExec differs
// from the already-specialized policies here — confirm no duplicate
// specialization in the CUDA build.
template <>
struct ExecutionPolicyToSpace<RAJAReductionExec> {
static constexpr const chai::ExecutionSpace value = chai::GPU;
};
#endif

#if CARE_ENABLE_GPU_SIMULATION_MODE
Expand Down Expand Up @@ -228,6 +232,37 @@ namespace care {
#endif
PluginData::setParallelContext(false);

#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = false;
#endif
}

////////////////////////////////////////////////////////////////////////////////
///
/// @author Peter Robinson
///
/// @brief Execute using the care::RAJAReductionExec policy
///
/// @arg[in] parallel_reduce Used to choose this overload of forall
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] start The starting index (inclusive)
/// @arg[in] end The ending index (exclusive)
/// @arg[in] body The loop body to execute at each index
///
////////////////////////////////////////////////////////////////////////////////
template <typename LB>
void forall(parallel_reduce, const char * fileName, const int lineNumber,
const int start, const int end, LB&& body) {
#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = true;
#endif
PluginData::setParallelContext(true);

forall(RAJAReductionExec{}, fileName, lineNumber, start, end, std::forward<LB>(body));

PluginData::setParallelContext(false);

#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = false;
#endif
Expand Down
8 changes: 8 additions & 0 deletions src/care/policies.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ namespace care {
// Empty dispatch-tag types: each selects a distinct care::forall overload
// (and thus an execution back-end) at compile time.
struct openmp {};
struct gpu {};
struct parallel {};
struct parallel_reduce {};   // routes forall to RAJAReductionExec (reduction kernels)
struct raja_fusible {};
struct raja_fusible_seq {};
struct managed_ptr_read {};
Expand All @@ -27,6 +28,7 @@ namespace care {
openmp,
gpu,
parallel,
parallel_reduce,
managed_ptr_read,
managed_ptr_write
};
Expand Down Expand Up @@ -65,6 +67,12 @@ using RAJADeviceExec = RAJA::seq_exec;

#endif // CARE_GPUCC

// Execution policy for reduction kernels: under HIP, use RAJA's
// occupancy-calculated launch policy (hip_exec_occ_calc) for reduce loops;
// every other configuration falls back to the default device policy.
#if defined(__HIPCC__)
using RAJAReductionExec = RAJA::hip_exec_occ_calc<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>;
#else
using RAJAReductionExec = RAJADeviceExec;
#endif



Expand Down
Loading