Skip to content

Commit

Permalink
[fix] set explicit row-major order (#1054)
Browse files Browse the repository at this point in the history
* explicitly set row-major MPI rank distribution in the blacs grid
* update test_wf_inner
  • Loading branch information
toxa81 authored Feb 14, 2025
1 parent dd0bc0f commit daee52b
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 39 deletions.
1 change: 1 addition & 0 deletions apps/mini_app/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ set(SIRIUS_SCF_FLAGS_cpu_band_parallel --control.processing_unit=cpu --contro
# todo: Add OMP_NUM_THREADS + srun / mpiexec flags here too?

if(BUILD_TESTING)
set(CTEST_OUTPUT_ON_FAILURE 1)
file(GLOB dirs LIST_DIRECTORIES true "${CMAKE_SOURCE_DIR}/verification/test*")

foreach(full_path ${dirs})
Expand Down
69 changes: 44 additions & 25 deletions apps/tests/test_wf_inner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ int
test_wf_inner_impl(std::vector<int> mpi_grid_dims__, double cutoff__, int num_bands__, int bs__, memory_t mem__)
{
spla::Context spla_ctx(is_host_memory(mem__) ? SPLA_PU_HOST : SPLA_PU_GPU);
/* we have plenty of gpu memory, allow a larger tile size */
spla_ctx.set_tile_size_gpu(2096);

std::unique_ptr<la::BLACS_grid> blacs_grid;
if (mpi_grid_dims__[0] * mpi_grid_dims__[1] == 1) {
Expand All @@ -38,12 +40,14 @@ test_wf_inner_impl(std::vector<int> mpi_grid_dims__, double cutoff__, int num_ba

auto sr = wf::spin_range(0, 2);

double pref = 1.0 / std::sqrt(gvec->num_gvec());

for (auto s = sr.begin(); s != sr.end(); s++) {
for (int i = 0; i < num_bands__; i++) {
for (int igloc = 0; igloc < gvec->count(); igloc++) {
int ig = igloc + gvec->offset();
phi1.pw_coeffs(igloc, s, wf::band_index(i)) = static_cast<double>(i + 1) / (ig + 1);
phi2.pw_coeffs(igloc, s, wf::band_index(i)) = static_cast<double>(ig + 1) / (i + 1) / gvec->num_gvec();
phi1.pw_coeffs(igloc, s, wf::band_index(i)) = pref * (i + 1) / (ig + 1);
phi2.pw_coeffs(igloc, s, wf::band_index(i)) = pref * (ig + 1) / (i + 1);
}
}
}
Expand All @@ -58,33 +62,48 @@ test_wf_inner_impl(std::vector<int> mpi_grid_dims__, double cutoff__, int num_ba
0);
mpi::Communicator::world().barrier();

double t = -wtime();
wf::inner(spla_ctx, mem__, sr, phi1, wf::band_range(0, num_bands__), phi2, wf::band_range(0, num_bands__), ovlp, 0,
0);
mpi::Communicator::world().barrier();
t += wtime();

double perf = sr.size() * 8e-9 * num_bands__ * num_bands__ * gvec->num_gvec() / t;
if (mpi::Communicator::world().rank() == 0) {
printf("execution time (sec) : %12.6f\n", t);
printf("performance (GFlops) : %12.6f\n", perf);
}
Measurement stat;

double max_diff{0};
for (int j = 0; j < ovlp.num_cols_local(); j++) {
auto jcol = ovlp.icol(j);
for (int i = 0; i < ovlp.num_rows_local(); i++) {
auto irow = ovlp.irow(i);
/* 2 is accumulated from two spins */
std::complex<double> z = ovlp(i, j) - 2 * static_cast<double>(irow + 1) / (jcol + 1);
max_diff = std::max(max_diff, std::abs(z));
int ierr{0};
for (int k = 0; k < 4; k++) {
if (mpi::Communicator::world().rank() == 0) {
std::cout << "step " << k << std::endl;
}
double t = -wtime();
wf::inner(spla_ctx, mem__, sr, phi1, wf::band_range(0, num_bands__), phi2, wf::band_range(0, num_bands__), ovlp,
0, 0);
mpi::Communicator::world().barrier();
t += wtime();
double perf = sr.size() * 8e-9 * num_bands__ * num_bands__ * gvec->num_gvec() / t;
stat.push_back(perf);
if (mpi::Communicator::world().rank() == 0) {
std::cout << "execution time : " << t << " sec." << std::endl;
std::cout << "performance : " << perf << " GFlops" << ", " << perf / mpi::Communicator::world().size()
<< " GFlops/rank" << std::endl;
}
double max_diff{0};
for (int j = 0; j < ovlp.num_cols_local(); j++) {
auto jcol = ovlp.icol(j);
for (int i = 0; i < ovlp.num_rows_local(); i++) {
auto irow = ovlp.irow(i);
/* factor 1 or 2 is accumulated from spin components */
auto z = ovlp(i, j) - sr.size() * static_cast<double>(irow + 1) / (jcol + 1);
max_diff = std::max(max_diff, std::abs(z));
}
}
mpi::Communicator::world().allreduce<double, mpi::op_t::max>(&max_diff, 1);
if (mpi::Communicator::world().rank() == 0) {
std::cout << "max diff : " << max_diff << std::endl;
}
if (max_diff > 1e-8) {
ierr++;
}
}
mpi::Communicator::world().reduce<double, mpi::op_t::max>(&max_diff, 1, 0);
if (max_diff > 1e-10) {
return 1;
if (mpi::Communicator::world().rank() == 0) {
std::cout << "average performance (GFlops) : " << stat.average() << ", sigma : " << stat.sigma() << std::endl;
}
return 0;

return ierr;
}

int
Expand Down
1 change: 0 additions & 1 deletion src/api/sirius_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3029,7 +3029,6 @@ sirius_get_energy(void* const* gs_handler__, char const* label__, double* energy

auto& kset = gs.k_point_set();
auto& ctx = kset.ctx();
auto& unit_cell = kset.unit_cell();
auto& potential = gs.potential();
auto& density = gs.density();

Expand Down
18 changes: 9 additions & 9 deletions src/core/la/dmatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ dmatrix<T>::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid
, blacs_grid_(&blacs_grid__)
, spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_)
, spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_)
, spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping(
blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(),
blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
, spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic(blacs_grid__.comm().native(), 'R',
blacs_grid__.num_ranks_row(),
blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
{
init();
}
Expand All @@ -58,9 +58,9 @@ dmatrix<T>::dmatrix(T* ptr__, int num_rows__, int num_cols__, BLACS_grid const&
, blacs_grid_(&blacs_grid__)
, spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_)
, spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_)
, spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping(
blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(),
blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
, spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic(blacs_grid__.comm().native(), 'R',
blacs_grid__.num_ranks_row(),
blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
{
init();
}
Expand Down Expand Up @@ -119,9 +119,9 @@ dmatrix<T>::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid
, blacs_grid_(&blacs_grid__)
, spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_)
, spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_)
, spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping(
blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(),
blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
, spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic(blacs_grid__.comm().native(), 'R',
blacs_grid__.num_ranks_row(),
blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
{
init();
}
Expand Down
4 changes: 1 addition & 3 deletions src/potential/generate_d_mtrx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,7 @@ Potential::generate_d_mtrx()
if (!atom_type.augment() || atom_type.num_atoms() == 0) {
for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
for (int i = 0; i < atom_type.num_atoms(); i++) {
int ia = atom_type.atom_id(i);
auto& atom = unit_cell_.atom(ia);

int const ia = atom_type.atom_id(i);
for (int xi2 = 0; xi2 < nbf; xi2++) {
for (int xi1 = 0; xi1 < nbf; xi1++) {
d_mtrx_[ia](xi1, xi2, iv) = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/unit_cell/atom_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ class Atom_type
/// Add radial function of the augmentation charge.
/** Radial functions of beta projectors must be added already. Their total number will be used to
determine the storage size for the radial functions of the augmented charge. */
inline void
void
add_q_radial_function(int idxrf1__, int idxrf2__, int l__, std::vector<double> qrf__);

/// Set the radial grid of the given type.
Expand Down

0 comments on commit daee52b

Please sign in to comment.