[fix] set explicit row-major order (#1054)

* explicitly set row-major MPI rank distribution in the BLACS grid
* update test_wf_inner
electronic-structure · Feb 14, 2025 · daee52b · daee52b
1 parent dd0bc0f
commit daee52b
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 39 deletions.
diff --git a/apps/mini_app/CMakeLists.txt b/apps/mini_app/CMakeLists.txt
@@ -22,6 +22,7 @@ set(SIRIUS_SCF_FLAGS_cpu_band_parallel    --control.processing_unit=cpu --contro
 # todo: Add OMP_NUM_THREADS + srun / mpiexec flags here too?
 
 if(BUILD_TESTING)
+    set(CTEST_OUTPUT_ON_FAILURE 1)
     file(GLOB dirs LIST_DIRECTORIES true "${CMAKE_SOURCE_DIR}/verification/test*")
 
     foreach(full_path ${dirs})

diff --git a/apps/tests/test_wf_inner.cpp b/apps/tests/test_wf_inner.cpp
@@ -15,6 +15,8 @@ int
 test_wf_inner_impl(std::vector<int> mpi_grid_dims__, double cutoff__, int num_bands__, int bs__, memory_t mem__)
 {
     spla::Context spla_ctx(is_host_memory(mem__) ? SPLA_PU_HOST : SPLA_PU_GPU);
+    /* we have plenty of gpu memory, allow a larger tile size */
+    spla_ctx.set_tile_size_gpu(2096);
 
     std::unique_ptr<la::BLACS_grid> blacs_grid;
     if (mpi_grid_dims__[0] * mpi_grid_dims__[1] == 1) {
@@ -38,12 +40,14 @@ test_wf_inner_impl(std::vector<int> mpi_grid_dims__, double cutoff__, int num_ba
 
     auto sr = wf::spin_range(0, 2);
 
+    double pref = 1.0 / std::sqrt(gvec->num_gvec());
+
     for (auto s = sr.begin(); s != sr.end(); s++) {
         for (int i = 0; i < num_bands__; i++) {
             for (int igloc = 0; igloc < gvec->count(); igloc++) {
                 int ig                                      = igloc + gvec->offset();
-                phi1.pw_coeffs(igloc, s, wf::band_index(i)) = static_cast<double>(i + 1) / (ig + 1);
-                phi2.pw_coeffs(igloc, s, wf::band_index(i)) = static_cast<double>(ig + 1) / (i + 1) / gvec->num_gvec();
+                phi1.pw_coeffs(igloc, s, wf::band_index(i)) = pref * (i + 1) / (ig + 1);
+                phi2.pw_coeffs(igloc, s, wf::band_index(i)) = pref * (ig + 1) / (i + 1);
             }
         }
     }
@@ -58,33 +62,48 @@ test_wf_inner_impl(std::vector<int> mpi_grid_dims__, double cutoff__, int num_ba
               0);
     mpi::Communicator::world().barrier();
 
-    double t = -wtime();
-    wf::inner(spla_ctx, mem__, sr, phi1, wf::band_range(0, num_bands__), phi2, wf::band_range(0, num_bands__), ovlp, 0,
-              0);
-    mpi::Communicator::world().barrier();
-    t += wtime();
-
-    double perf = sr.size() * 8e-9 * num_bands__ * num_bands__ * gvec->num_gvec() / t;
-    if (mpi::Communicator::world().rank() == 0) {
-        printf("execution time (sec) : %12.6f\n", t);
-        printf("performance (GFlops) : %12.6f\n", perf);
-    }
+    Measurement stat;
 
-    double max_diff{0};
-    for (int j = 0; j < ovlp.num_cols_local(); j++) {
-        auto jcol = ovlp.icol(j);
-        for (int i = 0; i < ovlp.num_rows_local(); i++) {
-            auto irow = ovlp.irow(i);
-            /* 2 is accumulated from two spins */
-            std::complex<double> z = ovlp(i, j) - 2 * static_cast<double>(irow + 1) / (jcol + 1);
-            max_diff               = std::max(max_diff, std::abs(z));
+    int ierr{0};
+    for (int k = 0; k < 4; k++) {
+        if (mpi::Communicator::world().rank() == 0) {
+            std::cout << "step " << k << std::endl;
+        }
+        double t = -wtime();
+        wf::inner(spla_ctx, mem__, sr, phi1, wf::band_range(0, num_bands__), phi2, wf::band_range(0, num_bands__), ovlp,
+                  0, 0);
+        mpi::Communicator::world().barrier();
+        t += wtime();
+        double perf = sr.size() * 8e-9 * num_bands__ * num_bands__ * gvec->num_gvec() / t;
+        stat.push_back(perf);
+        if (mpi::Communicator::world().rank() == 0) {
+            std::cout << "execution time : " << t << " sec." << std::endl;
+            std::cout << "performance : " << perf << " GFlops" << ", " << perf / mpi::Communicator::world().size()
+                      << " GFlops/rank" << std::endl;
+        }
+        double max_diff{0};
+        for (int j = 0; j < ovlp.num_cols_local(); j++) {
+            auto jcol = ovlp.icol(j);
+            for (int i = 0; i < ovlp.num_rows_local(); i++) {
+                auto irow = ovlp.irow(i);
+                /* factor 1 or 2 is accumulated from spin components */
+                auto z   = ovlp(i, j) - sr.size() * static_cast<double>(irow + 1) / (jcol + 1);
+                max_diff = std::max(max_diff, std::abs(z));
+            }
+        }
+        mpi::Communicator::world().allreduce<double, mpi::op_t::max>(&max_diff, 1);
+        if (mpi::Communicator::world().rank() == 0) {
+            std::cout << "max diff : " << max_diff << std::endl;
+        }
+        if (max_diff > 1e-8) {
+            ierr++;
         }
     }
-    mpi::Communicator::world().reduce<double, mpi::op_t::max>(&max_diff, 1, 0);
-    if (max_diff > 1e-10) {
-        return 1;
+    if (mpi::Communicator::world().rank() == 0) {
+        std::cout << "average performance (GFlops) : " << stat.average() << ", sigma : " << stat.sigma() << std::endl;
     }
-    return 0;
+
+    return ierr;
 }
 
 int

diff --git a/src/api/sirius_api.cpp b/src/api/sirius_api.cpp
@@ -3029,7 +3029,6 @@ sirius_get_energy(void* const* gs_handler__, char const* label__, double* energy
 
                 auto& kset      = gs.k_point_set();
                 auto& ctx       = kset.ctx();
-                auto& unit_cell = kset.unit_cell();
                 auto& potential = gs.potential();
                 auto& density   = gs.density();
 

diff --git a/src/core/la/dmatrix.cpp b/src/core/la/dmatrix.cpp
@@ -34,9 +34,9 @@ dmatrix<T>::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid
     , blacs_grid_(&blacs_grid__)
     , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_)
     , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_)
-    , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping(
-              blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(),
-              blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
+    , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic(blacs_grid__.comm().native(), 'R',
+                                                                     blacs_grid__.num_ranks_row(),
+                                                                     blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
 {
     init();
 }
@@ -58,9 +58,9 @@ dmatrix<T>::dmatrix(T* ptr__, int num_rows__, int num_cols__, BLACS_grid const&
     , blacs_grid_(&blacs_grid__)
     , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_)
     , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_)
-    , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping(
-              blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(),
-              blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
+    , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic(blacs_grid__.comm().native(), 'R',
+                                                                     blacs_grid__.num_ranks_row(),
+                                                                     blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
 {
     init();
 }
@@ -119,9 +119,9 @@ dmatrix<T>::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid
     , blacs_grid_(&blacs_grid__)
     , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_)
     , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_)
-    , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping(
-              blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(),
-              blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
+    , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic(blacs_grid__.comm().native(), 'R',
+                                                                     blacs_grid__.num_ranks_row(),
+                                                                     blacs_grid__.num_ranks_col(), bs_row__, bs_col__))
 {
     init();
 }

diff --git a/src/potential/generate_d_mtrx.cpp b/src/potential/generate_d_mtrx.cpp
@@ -65,9 +65,7 @@ Potential::generate_d_mtrx()
         if (!atom_type.augment() || atom_type.num_atoms() == 0) {
             for (int iv = 0; iv < ctx_.num_mag_dims() + 1; iv++) {
                 for (int i = 0; i < atom_type.num_atoms(); i++) {
-                    int ia     = atom_type.atom_id(i);
-                    auto& atom = unit_cell_.atom(ia);
-
+                    int const ia = atom_type.atom_id(i);
                     for (int xi2 = 0; xi2 < nbf; xi2++) {
                         for (int xi1 = 0; xi1 < nbf; xi1++) {
                             d_mtrx_[ia](xi1, xi2, iv) = 0;

diff --git a/src/unit_cell/atom_type.hpp b/src/unit_cell/atom_type.hpp
@@ -398,7 +398,7 @@ class Atom_type
     /// Add radial function of the augmentation charge.
     /** Radial functions of beta projectors must be added already. Their total number will be used to
         determine the storage size for the radial functions of the augmented charge. */
-    inline void
+    void
     add_q_radial_function(int idxrf1__, int idxrf2__, int l__, std::vector<double> qrf__);
 
     /// Set the radial grid of the given type.