From 9db29e8982befdfcfefa9b2606dd89d279ff088f Mon Sep 17 00:00:00 2001 From: GengLiang Date: Fri, 13 Dec 2024 17:48:24 +0800 Subject: [PATCH] [mthreads]base/toolkits: recovery MPI_intra test code --- .../mthreads/S4000/bandwidth.mu | 80 +++++++++++++++---- .../mthreads/S4000/main.sh | 3 +- 2 files changed, 66 insertions(+), 17 deletions(-) diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu index f55952504..148b9a948 100644 --- a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu @@ -1,10 +1,14 @@ #include #include +#include +#include +#include +#include #define GB (1024ULL * 1024ULL * 1024ULL) -#define SIZE (16ULL * GB) +#define SIZE (4ULL * GB) #define WARMUP_ITERATIONS 100 -#define ITERATIONS 1000 +#define ITERATIONS 10000 void checkMusaError(musaError_t err, const char* msg) { if (err != musaSuccess) { @@ -13,41 +17,85 @@ void checkMusaError(musaError_t err, const char* msg) { } } +void checkMcclError(mcclResult_t result, const char* msg) { + if (result != mcclSuccess) { + fprintf(stderr, "MCCL Error: %s: %s\n", msg, mcclGetErrorString(result)); + exit(EXIT_FAILURE); + } +} + int main() { - float* d_src, * d_dst; + const int num_gpus = 8; + int devs[num_gpus] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + musaEvent_t start, end; float elapsed_time; + std::vector d_src(num_gpus); + std::vector d_dst(num_gpus); + std::vector comms(num_gpus); + std::vector streams(num_gpus); - checkMusaError(musaMallocHost(&d_src, SIZE), "musaMallocHost"); - checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + for (int i = 0; i < num_gpus; ++i) { + checkMusaError(musaSetDevice(devs[i]), "musaSetDevice"); + checkMusaError(musaMalloc(&d_src[i], SIZE), "musaMalloc"); + checkMusaError(musaMalloc(&d_dst[i], SIZE), "musaMalloc"); + checkMusaError(musaMemset(d_src[i], 1.0f, SIZE), "musaMemset"); + checkMusaError(musaStreamCreate(&streams[i]), "musaStreamCreate"); + } + checkMcclError(mcclCommInitAll(comms.data(), num_gpus, devs), "mcclCommInitAll"); checkMusaError(musaEventCreate(&start), "musaEventCreate"); checkMusaError(musaEventCreate(&end), "musaEventCreate"); - for (int i = 0; i < WARMUP_ITERATIONS; ++i) { - checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + checkMcclError(mcclGroupStart(), "mcclGroupStart"); + for (int j = 0; j < num_gpus; ++j) { + checkMcclError(mcclAllReduce((const void*)d_src[j], (void*)d_dst[j], SIZE / sizeof(float), mcclFloat, mcclSum, comms[j], streams[j]), "mcclAllReduce"); + } + checkMcclError(mcclGroupEnd(), "mcclGroupEnd"); + for (int j = 0; j < num_gpus; ++j) { + checkMusaError(musaStreamSynchronize(streams[j]), "musaStreamSynchronize"); + } } checkMusaError(musaEventRecord(start), "musaEventRecord"); for (int i = 0; i < ITERATIONS; ++i) { - checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + checkMcclError(mcclGroupStart(), "mcclGroupStart"); + for (int j = 0; j < num_gpus; ++j) { + checkMcclError(mcclAllReduce((const void*)d_src[j], (void*)d_dst[j], SIZE / sizeof(float), mcclFloat, mcclSum, comms[j], streams[j]), "mcclAllReduce"); + } + checkMcclError(mcclGroupEnd(), "mcclGroupEnd"); + for (int j = 0; j < num_gpus; ++j) { + checkMusaError(musaStreamSynchronize(streams[j]), "musaStreamSynchronize"); + } } - checkMusaError(musaEventRecord(end), "musaEventRecord"); checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); - checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); - double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0); + double algbw = SIZE * ITERATIONS / (elapsed_time / 1000.0); + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-algbw=" + << std::fixed << std::setprecision(2) << algbw / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; - printf("[FlagPerf Result]transfer-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); - printf("[FlagPerf Result]transfer-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-algbw=" + << std::fixed << std::setprecision(2) << algbw / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; + double bandwidth = algbw * (2.0 * (num_gpus - 1) / num_gpus); + bandwidth = bandwidth + bandwidth; + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; - checkMusaError(musaFreeHost(d_src), "musaFreeHost"); - checkMusaError(musaFree(d_dst), "musaFree"); + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; + for (int i = 0; i < num_gpus; ++i) { + checkMusaError(musaFree(d_src[i]), "musaFree"); + checkMusaError(musaFree(d_dst[i]), "musaFree"); + checkMcclError(mcclCommDestroy(comms[i]), "mcclCommDestroy"); + } checkMusaError(musaEventDestroy(start), "musaEventDestroy"); checkMusaError(musaEventDestroy(end), "musaEventDestroy"); - return 0; } \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh index 102ab5bc3..3cfff19f4 100644 --- a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh @@ -1,2 +1,3 @@ -mcc bandwidth.mu -o bdtest -lmusart +export MCCL_PROTOS=2 +mcc bandwidth.mu -o bdtest -lmusart -lmccl ./bdtest \ No newline at end of file