diff --git a/tests/fuzz/device/sync/barrier_kernel.h b/tests/fuzz/device/sync/barrier_kernel.h index 33eaacf23ad65c3c3fe5ca6546670803e8cb47a4..f87321779632bd9d49f28df772b3b256f857c10c 100644 --- a/tests/fuzz/device/sync/barrier_kernel.h +++ b/tests/fuzz/device/sync/barrier_kernel.h @@ -12,9 +12,23 @@ void increase_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); void increase_vec_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void increase_odd_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void increase_vec_odd_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void increase_do_part_pes(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id); +void increase_vec_do_part_pes(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id); void increase_do_odd_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size, shmem_team_t team_id); void increase_vec_do_odd_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size, shmem_team_t team_id); +void increase_do_half_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id); +void increase_vec_do_half_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id); +void increase_do_one_pe_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id); +void increase_vec_do_one_pe_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id); #endif // BARRIER_KERNEL_H \ No newline at end of file diff --git a/tests/fuzz/device/sync/p2p_kernel.h b/tests/fuzz/device/sync/p2p_kernel.h index 6ea55d59c29413fed67c204c2b9259528d66d30d..59cebb92f227421d6a1bcef3873eb4ec1d367f55 100644 --- a/tests/fuzz/device/sync/p2p_kernel.h +++ b/tests/fuzz/device/sync/p2p_kernel.h @@ -11,5 +11,10 @@ #define P2P_KERNEL_H void p2p_chain_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void p2p_chain_cmp_ne_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void p2p_chain_cmp_gt_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void p2p_chain_cmp_ge_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void p2p_chain_cmp_lt_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); +void p2p_chain_cmp_le_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size); #endif \ No newline at end of file diff --git a/tests/unittest/device/sync/barrier/barrier_kernel.cpp b/tests/unittest/device/sync/barrier/barrier_kernel.cpp index f115a1c7a80798a46b065c58a3183bdeba754761..b80ac02083952fdce1fba30471aa21883f5b15f7 100644 --- a/tests/unittest/device/sync/barrier/barrier_kernel.cpp +++ b/tests/unittest/device/sync/barrier/barrier_kernel.cpp @@ -42,6 +42,81 @@ extern "C" SHMEM_GLOBAL void increase_vec(uint64_t config, GM_ADDR addr, int ran #endif } +extern "C" SHMEM_GLOBAL void increase_odd(uint64_t config, GM_ADDR addr, int rank_id, int rank_size) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_CUBE__ + // scalar unit of cube core is not affected by barrier + shmem_barrier_all(); + shmem_barrier_all(); +#endif + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + + shmem_barrier_all(); + if (rank_id 
& 1) { + GM_ADDR remote = shmemi_ptr(addr, (rank_id + 2) % rank_size); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + + shmem_barrier_all(); +#endif +} + +extern "C" SHMEM_GLOBAL void increase_vec_odd(uint64_t config, GM_ADDR addr, int rank_id, int rank_size) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + + shmemx_barrier_all_vec(); + if (rank_id & 1) { + GM_ADDR remote = shmemi_ptr(addr, (rank_id + 2) % rank_size); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + shmemx_barrier_all_vec(); +#endif +} + +extern "C" SHMEM_GLOBAL void increase_part_pes(uint64_t config, GM_ADDR addr, int rank_id, + int rank_size, shmem_team_t team_id) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_CUBE__ + // scalar unit of cube core is not affected by barrier + shmem_barrier_all(); + shmem_barrier_all(); +#endif + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + + shmem_barrier(team_id); + if (rank_id & 1) { + GM_ADDR remote = shmemi_ptr(addr, (rank_id + 2) % rank_size); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + shmem_barrier(team_id); +#endif +} + +extern "C" SHMEM_GLOBAL void increase_vec_part_pes(uint64_t config, GM_ADDR addr, int rank_id, + int rank_size, shmem_team_t team_id) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + + shmemx_barrier_vec(team_id); + if (rank_id & 1) { + GM_ADDR remote = shmemi_ptr(addr, (rank_id + 2) % rank_size); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + shmemx_barrier_vec(team_id); +#endif +} + extern "C" SHMEM_GLOBAL void increase_odd_team(uint64_t config, GM_ADDR addr, int rank_id, int rank_size, shmem_team_t team_id) { shmemx_set_ffts_config(config); @@ -80,6 +155,86 @@ extern "C" SHMEM_GLOBAL void increase_vec_odd_team(uint64_t config, GM_ADDR addr #endif } +extern "C" SHMEM_GLOBAL void increase_half_team(uint64_t config, GM_ADDR addr, int rank_id, + int rank_size, shmem_team_t team_id) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_CUBE__ + // scalar unit of cube core is not affected by barrier + shmem_barrier_all(); + shmem_barrier_all(); +#endif + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + int team_size = rank_size / 2; + + shmem_barrier(team_id); + if (rank_id >= 1 && rank_id - 1 < team_size) { + GM_ADDR remote = shmemi_ptr(addr, 1 + (rank_id + 1) % team_size); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + shmem_barrier(team_id); +#endif +} + +extern "C" SHMEM_GLOBAL void increase_vec_half_team(uint64_t config, GM_ADDR addr, int rank_id, + int rank_size, shmem_team_t team_id) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + int team_size = rank_size / 2; + + shmemx_barrier_vec(team_id); + if (rank_id >= 1 && rank_id - 1 < team_size) { + GM_ADDR remote = shmemi_ptr(addr, 1 + (rank_id + 1) % team_size); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + shmemx_barrier_vec(team_id); +#endif +} + +extern "C" SHMEM_GLOBAL void increase_one_pe_team(uint64_t config, GM_ADDR addr, int rank_id, + int rank_size, shmem_team_t team_id) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_CUBE__ + // scalar unit of cube core is not affected by barrier + shmem_barrier_all(); + shmem_barrier_all(); +#endif + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + int team_size = 
shmem_team_n_pes(team_id); + + shmem_barrier(team_id); + if (rank_id == 1) { + GM_ADDR remote = shmemi_ptr(addr, rank_id); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + shmem_barrier(team_id); +#endif +} + +extern "C" SHMEM_GLOBAL void increase_vec_one_pe_team(uint64_t config, GM_ADDR addr, int rank_id, + int rank_size, shmem_team_t team_id) { + shmemx_set_ffts_config(config); + +#ifdef __DAV_C220_VEC__ + uint64_t val = shmemi_load((__gm__ uint64_t *)addr); + int team_size = shmem_team_n_pes(team_id); + + shmemx_barrier_vec(team_id); + if (rank_id == 1) { + GM_ADDR remote = shmemi_ptr(addr, rank_id); + shmemi_store((__gm__ uint64_t *)remote, val + 1); + } + shmemx_barrier_vec(team_id); +#endif +} + void increase_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size) { increase<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size); @@ -90,6 +245,28 @@ void increase_vec_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, increase_vec<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size); } +void increase_odd_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size) +{ + increase_odd<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size); +} + +void increase_vec_odd_do(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size) +{ + increase_vec_odd<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size); +} + +void increase_do_part_pes(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id) +{ + increase_part_pes<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size, team_id); +} + +void increase_vec_do_part_pes(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id) +{ + increase_vec_part_pes<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size, team_id); +} + void increase_do_odd_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size, shmem_team_t team_id) { @@ -100,4 +277,28 @@ void increase_vec_do_odd_team(void* stream, uint64_t config, uint8_t *addr, int int rank_size, shmem_team_t team_id) { increase_vec_odd_team<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size, team_id); +} + +void increase_do_half_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id) +{ + increase_half_team<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size, team_id); +} + +void increase_vec_do_half_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id) +{ + increase_vec_half_team<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size, team_id); +} + +void increase_do_one_pe_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id) +{ + increase_one_pe_team<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size, team_id); +} + +void increase_vec_do_one_pe_team(void* stream, uint64_t config, uint8_t *addr, int rank_id, + int rank_size, shmem_team_t team_id) +{ + increase_vec_one_pe_team<<<16, nullptr, stream>>>(config, addr, rank_id, rank_size, team_id); } \ No newline at end of file diff --git a/tests/unittest/device/sync/p2p/p2p_kernel.cpp b/tests/unittest/device/sync/p2p/p2p_kernel.cpp index 21acbd4c49ce42cb0f47aa505b79533099f66f3f..4be5c001e2bdc182ccf45a11813f8f19e940bfac 100644 --- a/tests/unittest/device/sync/p2p/p2p_kernel.cpp +++ b/tests/unittest/device/sync/p2p/p2p_kernel.cpp @@ -43,7 +43,139 @@ extern "C" SHMEM_GLOBAL void p2p_chain(uint64_t 
config, GM_ADDR addr, int rank_i shmem_barrier_all(); } +extern "C" SHMEM_GLOBAL void p2p_chain_cmp_ne(uint64_t config, GM_ADDR addr, int rank_id, int rank_size) +{ + shmemx_set_ffts_config(config); + auto sig_addr = (__gm__ int32_t *)addr; + int next = (rank_id + 1) % rank_size; + + shmem_barrier_all(); + +#ifdef __DAV_C220_VEC__ + if (rank_id == 0) { + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + shmem_signal_wait_until(sig_addr, SHMEM_CMP_NE, 0); + } else { + shmem_signal_wait_until(sig_addr, SHMEM_CMP_NE, 0); + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + } +#endif + + shmem_barrier_all(); +} + +extern "C" SHMEM_GLOBAL void p2p_chain_cmp_gt(uint64_t config, GM_ADDR addr, int rank_id, int rank_size) +{ + shmemx_set_ffts_config(config); + auto sig_addr = (__gm__ int32_t *)addr; + int next = (rank_id + 1) % rank_size; + + shmem_barrier_all(); + +#ifdef __DAV_C220_VEC__ + if (rank_id == 0) { + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + shmem_signal_wait_until(sig_addr, SHMEM_CMP_GT, 0); + } else { + shmem_signal_wait_until(sig_addr, SHMEM_CMP_GT, 0); + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + } +#endif + + shmem_barrier_all(); +} + +extern "C" SHMEM_GLOBAL void p2p_chain_cmp_ge(uint64_t config, GM_ADDR addr, int rank_id, int rank_size) +{ + shmemx_set_ffts_config(config); + auto sig_addr = (__gm__ int32_t *)addr; + int next = (rank_id + 1) % rank_size; + + shmem_barrier_all(); + +#ifdef __DAV_C220_VEC__ + if (rank_id == 0) { + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + shmem_signal_wait_until(sig_addr, SHMEM_CMP_GE, 1); + } else { + shmem_signal_wait_until(sig_addr, SHMEM_CMP_GE, 1); + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + } +#endif + + shmem_barrier_all(); +} + +extern "C" SHMEM_GLOBAL void p2p_chain_cmp_lt(uint64_t config, GM_ADDR addr, int rank_id, int rank_size) +{ + shmemx_set_ffts_config(config); + auto sig_addr = (__gm__ int32_t *)addr; + int next = (rank_id + 1) % rank_size; + + *sig_addr = 10; + shmem_barrier_all(); + +#ifdef __DAV_C220_VEC__ + if (rank_id == 0) { + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + shmem_signal_wait_until(sig_addr, SHMEM_CMP_LT, 5); + } else { + shmem_signal_wait_until(sig_addr, SHMEM_CMP_LT, 5); + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + } +#endif + + shmem_barrier_all(); +} + +extern "C" SHMEM_GLOBAL void p2p_chain_cmp_le(uint64_t config, GM_ADDR addr, int rank_id, int rank_size) +{ + shmemx_set_ffts_config(config); + auto sig_addr = (__gm__ int32_t *)addr; + int next = (rank_id + 1) % rank_size; + + *sig_addr = 10; + shmem_barrier_all(); + +#ifdef __DAV_C220_VEC__ + if (rank_id == 0) { + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + shmem_signal_wait_until(sig_addr, SHMEM_CMP_LE, 5); + } else { + shmem_signal_wait_until(sig_addr, SHMEM_CMP_LE, 5); + shmemx_signal_op(sig_addr, 1, SHMEM_SIGNAL_SET, next); + } +#endif + + shmem_barrier_all(); +} + void p2p_chain_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size) { p2p_chain<<<1, nullptr, stream>>>(config, addr, rank_id, rank_size); +} + +void p2p_chain_cmp_ne_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size) +{ + p2p_chain_cmp_ne<<<1, nullptr, stream>>>(config, addr, rank_id, rank_size); +} + +void p2p_chain_cmp_gt_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size) +{ + p2p_chain_cmp_gt<<<1, nullptr, stream>>>(config, addr, rank_id, rank_size); +} + +void p2p_chain_cmp_ge_do(void *stream, uint64_t 
config, uint8_t *addr, int rank_id, int rank_size)
+{
+    p2p_chain_cmp_ge<<<1, nullptr, stream>>>(config, addr, rank_id, rank_size);
+}
+
+void p2p_chain_cmp_lt_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size)
+{
+    p2p_chain_cmp_lt<<<1, nullptr, stream>>>(config, addr, rank_id, rank_size);
+}
+
+void p2p_chain_cmp_le_do(void *stream, uint64_t config, uint8_t *addr, int rank_id, int rank_size)
+{
+    p2p_chain_cmp_le<<<1, nullptr, stream>>>(config, addr, rank_id, rank_size);
 }
\ No newline at end of file
diff --git a/tests/unittest/host/main_test.cpp b/tests/unittest/host/main_test.cpp
index a91dbaba20a880f3c19bf83266054db255bee07c..8b14b1522c367c756e76b510c49e5e72f10f5b8a 100644
--- a/tests/unittest/host/main_test.cpp
+++ b/tests/unittest/host/main_test.cpp
@@ -88,10 +88,16 @@ void test_mutil_task(std::function<void(int, int, uint64_t)> func, uint64_t loca
             std::cout << "fork failed ! " << pids[i] << std::endl;
         } else if (pids[i] == 0) {
             func(i + test_first_rank, test_global_ranks, local_mem_size);
-            exit(0);
+            if (::testing::Test::HasFailure()) {
+                _exit(1);
+            }
+            _exit(0);
         }
     }
     for (int i = 0; i < process_count; ++i) {
+        if (pids[i] <= 0) {
+            continue;
+        }
         waitpid(pids[i], &status[i], 0);
         if (WIFEXITED(status[i]) && WEXITSTATUS(status[i]) != 0) {
             FAIL();
diff --git a/tests/unittest/host/sync/barrier/barrier_host_test.cpp b/tests/unittest/host/sync/barrier/barrier_host_test.cpp
index fbf71afcdef241c2ecbd3eab456b948b8ad75ec3..b9edda89eda5964e19773ab9665c200aeefd1865 100644
--- a/tests/unittest/host/sync/barrier/barrier_host_test.cpp
+++ b/tests/unittest/host/sync/barrier/barrier_host_test.cpp
@@ -17,7 +17,7 @@
 #include "unittest_main_test.h"
 #include "barrier_kernel.h"
 
-constexpr int32_t SHMEM_BARRIER_TEST_NUM = 3;
+constexpr int32_t SHMEM_BARRIER_TEST_NUM = 100;
 
 static void test_barrier_black_box(int32_t rank_id, int32_t n_ranks, uint64_t local_mem_size)
 {
@@ -31,7 +31,7 @@ static void test_barrier_black_box(int32_t rank_id, int32_t n_ranks, uint64_t lo
     uint64_t *addr_host;
     ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host), sizeof(uint64_t)), 0);
 
-    for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+    for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM / 2; i++) {
         std::cout << "[TEST] barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl;
         increase_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, n_ranks);
         ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
@@ -40,12 +40,23 @@ static void test_barrier_black_box(int32_t rank_id, int32_t n_ranks, uint64_t lo
         shm::shmemi_control_barrier_all();
     }
 
+    for (int32_t i = SHMEM_BARRIER_TEST_NUM / 2 + 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+        std::cout << "[TEST] barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl;
+        increase_odd_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, n_ranks);
+        ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+        if (rank_id & 1) {
+            ASSERT_EQ(aclrtMemcpy(addr_host, sizeof(uint64_t), addr_dev, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST), 0);
+            ASSERT_EQ((*addr_host), i);
+        }
+        shm::shmemi_control_barrier_all();
+    }
+
     uint64_t *addr_dev_vec = static_cast<uint64_t *>(shmem_malloc(sizeof(uint64_t)));
     ASSERT_EQ(aclrtMemset(addr_dev_vec, sizeof(uint64_t), 0, sizeof(uint64_t)), 0);
     uint64_t *addr_host_vec;
     ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host_vec), sizeof(uint64_t)), 0);
 
-    for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+    for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM / 2; i++) {
         std::cout << "[TEST] vec barriers test blackbox rank_id: " <<
rank_id << " time: " << i << std::endl;
         increase_vec_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev_vec, rank_id, n_ranks);
         ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
@@ -55,15 +66,23 @@ static void test_barrier_black_box(int32_t rank_id, int32_t n_ranks, uint64_t lo
         shm::shmemi_control_barrier_all();
     }
 
+    for (int32_t i = SHMEM_BARRIER_TEST_NUM / 2 + 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+        std::cout << "[TEST] vec barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl;
+        increase_vec_odd_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev_vec, rank_id, n_ranks);
+        ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+        if (rank_id & 1) {
+            ASSERT_EQ(aclrtMemcpy(addr_host_vec, sizeof(uint64_t), addr_dev_vec, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST), 0);
+            ASSERT_EQ((*addr_host_vec), i);
+        }
+        shm::shmemi_control_barrier_all();
+    }
+
     ASSERT_EQ(aclrtFreeHost(addr_host), 0);
     shmem_free(addr_dev);
     ASSERT_EQ(aclrtFreeHost(addr_host_vec), 0);
     shmem_free(addr_dev_vec);
 
     test_finalize(stream, device_id);
-    if (::testing::Test::HasFailure()) {
-        exit(1);
-    }
 }
 
 static void test_barrier_black_box_odd_team(int32_t rank_id, int32_t n_ranks, uint64_t local_mem_size)
@@ -121,9 +140,181 @@ static void test_barrier_black_box_odd_team(int32_t rank_id, int32_t n_ranks, ui
     shmem_team_destroy(team_odd);
 
     test_finalize(stream, device_id);
-    if (::testing::Test::HasFailure()) {
-        exit(1);
-    }
+}
+
+static void test_barrier_black_box_team_part_pes(int32_t rank_id, int32_t n_ranks, uint64_t local_mem_size)
+{
+    int32_t device_id = rank_id % test_gnpu_num + test_first_npu;
+    aclrtStream stream;
+    test_init(rank_id, n_ranks, local_mem_size, &stream);
+    ASSERT_NE(stream, nullptr);
+
+    shmem_team_t team;
+    int start = 1;
+    int stride = 1;
+    int team_size = n_ranks - 1;
+    shmem_team_split_strided(SHMEM_TEAM_WORLD, start, stride, team_size, &team);
+
+    uint64_t *addr_dev = static_cast<uint64_t *>(shmem_malloc(sizeof(uint64_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(uint64_t), 0, sizeof(uint64_t)), 0);
+    uint64_t *addr_host;
+    ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host), sizeof(uint64_t)), 0);
+
+    uint64_t *addr_dev_vec = static_cast<uint64_t *>(shmem_malloc(sizeof(uint64_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev_vec, sizeof(uint64_t), 0, sizeof(uint64_t)), 0);
+    uint64_t *addr_host_vec;
+    ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host_vec), sizeof(uint64_t)), 0);
+
+    if (rank_id != 0) {
+        for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+            std::cout << "[TEST] barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl;
+            increase_do_part_pes(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, n_ranks, team);
+            ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+            if (rank_id & 1) {
+                ASSERT_EQ(aclrtMemcpy(addr_host, sizeof(uint64_t), addr_dev, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST),
+                          0);
+                ASSERT_EQ((*addr_host), i);
+            }
+            shm::shmemi_control_barrier_all();
+        }
+
+        for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+            std::cout << "[TEST] vec barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl;
+            increase_vec_do_part_pes(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev_vec, rank_id, n_ranks,
+                                     team);
+            ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+            if (rank_id & 1) {
+                ASSERT_EQ(
+                    aclrtMemcpy(addr_host_vec, sizeof(uint64_t), addr_dev_vec, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST),
+                    0);
+                ASSERT_EQ((*addr_host_vec), i);
+            }
+            shm::shmemi_control_barrier_all();
+        }
+    }
+
+    ASSERT_EQ(aclrtFreeHost(addr_host), 0);
+    
shmem_free(addr_dev);
+    ASSERT_EQ(aclrtFreeHost(addr_host_vec), 0);
+    shmem_free(addr_dev_vec);
+
+    shmem_team_destroy(team);
+
+    test_finalize(stream, device_id);
+}
+
+static void test_barrier_black_box_half_team(int32_t rank_id, int32_t n_ranks, uint64_t local_mem_size)
+{
+    int32_t device_id = rank_id % test_gnpu_num + test_first_npu;
+    aclrtStream stream;
+    test_init(rank_id, n_ranks, local_mem_size, &stream);
+    ASSERT_NE(stream, nullptr);
+
+    shmem_team_t team_half;
+    int start = 1;
+    int stride = 1;
+    int team_size = n_ranks / 2;
+    shmem_team_split_strided(SHMEM_TEAM_WORLD, start, stride, team_size, &team_half);
+
+    uint64_t *addr_dev = static_cast<uint64_t *>(shmem_malloc(sizeof(uint64_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(uint64_t), 0, sizeof(uint64_t)), 0);
+    uint64_t *addr_host;
+    ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host), sizeof(uint64_t)), 0);
+
+    uint64_t *addr_dev_vec = static_cast<uint64_t *>(shmem_malloc(sizeof(uint64_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev_vec, sizeof(uint64_t), 0, sizeof(uint64_t)), 0);
+    uint64_t *addr_host_vec;
+    ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host_vec), sizeof(uint64_t)), 0);
+
+    if (rank_id >= start && rank_id - start < team_size) {
+        for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+            std::cout << "[TEST] barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl;
+            increase_do_half_team(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, n_ranks, team_half);
+            ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+            ASSERT_EQ(aclrtMemcpy(addr_host, sizeof(uint64_t), addr_dev, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST),
+                      0);
+            ASSERT_EQ((*addr_host), i);
+            shm::shmemi_control_barrier_all();
+        }
+
+        for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+            std::cout << "[TEST] vec barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl;
+            increase_vec_do_half_team(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev_vec, rank_id, n_ranks,
+                                      team_half);
+            ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+            ASSERT_EQ(
+                aclrtMemcpy(addr_host_vec, sizeof(uint64_t), addr_dev_vec, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST),
+                0);
+            ASSERT_EQ((*addr_host_vec), i);
+            shm::shmemi_control_barrier_all();
+        }
+    }
+
+    ASSERT_EQ(aclrtFreeHost(addr_host), 0);
+    shmem_free(addr_dev);
+    ASSERT_EQ(aclrtFreeHost(addr_host_vec), 0);
+    shmem_free(addr_dev_vec);
+
+    shmem_team_destroy(team_half);
+
+    test_finalize(stream, device_id);
+}
+
+static void test_barrier_black_box_one_pe_team(int32_t rank_id, int32_t n_ranks, uint64_t local_mem_size)
+{
+    int32_t device_id = rank_id % test_gnpu_num + test_first_npu;
+    aclrtStream stream;
+    test_init(rank_id, n_ranks, local_mem_size, &stream);
+    ASSERT_NE(stream, nullptr);
+
+    shmem_team_t team_one_pe;
+    int start = 1;
+    int stride = 1;
+    int team_size = 1;
+    shmem_team_split_strided(SHMEM_TEAM_WORLD, start, stride, team_size, &team_one_pe);
+
+    uint64_t *addr_dev = static_cast<uint64_t *>(shmem_malloc(sizeof(uint64_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(uint64_t), 0, sizeof(uint64_t)), 0);
+    uint64_t *addr_host;
+    ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host), sizeof(uint64_t)), 0);
+
+    uint64_t *addr_dev_vec = static_cast<uint64_t *>(shmem_malloc(sizeof(uint64_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev_vec, sizeof(uint64_t), 0, sizeof(uint64_t)), 0);
+    uint64_t *addr_host_vec;
+    ASSERT_EQ(aclrtMallocHost(reinterpret_cast<void **>(&addr_host_vec), sizeof(uint64_t)), 0);
+
+    if (rank_id == 1) {
+        for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) {
+            std::cout << "[TEST] barriers
test blackbox rank_id: " << rank_id << " time: " << i << std::endl; + increase_do_one_pe_team(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, n_ranks, team_one_pe); + ASSERT_EQ(aclrtSynchronizeStream(stream), 0); + ASSERT_EQ(aclrtMemcpy(addr_host, sizeof(uint64_t), addr_dev, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST), + 0); + ASSERT_EQ((*addr_host), i); + shm::shmemi_control_barrier_all(); + } + + for (int32_t i = 1; i <= SHMEM_BARRIER_TEST_NUM; i++) { + std::cout << "[TEST] vec barriers test blackbox rank_id: " << rank_id << " time: " << i << std::endl; + increase_vec_do_one_pe_team(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev_vec, rank_id, n_ranks, + team_one_pe); + ASSERT_EQ(aclrtSynchronizeStream(stream), 0); + ASSERT_EQ( + aclrtMemcpy(addr_host_vec, sizeof(uint64_t), addr_dev_vec, sizeof(uint64_t), ACL_MEMCPY_DEVICE_TO_HOST), + 0); + ASSERT_EQ((*addr_host_vec), i); + shm::shmemi_control_barrier_all(); + } } + + ASSERT_EQ(aclrtFreeHost(addr_host), 0); + shmem_free(addr_dev); + ASSERT_EQ(aclrtFreeHost(addr_host_vec), 0); + shmem_free(addr_dev_vec); + + shmem_team_destroy(team_one_pe); + + test_finalize(stream, device_id); } TEST(TEST_SYNC_API, test_barrier_black_box) @@ -139,3 +330,24 @@ TEST(TEST_SYNC_API, test_barrier_black_box_odd_team) uint64_t local_mem_size = 1024UL * 1024UL * 16; test_mutil_task(test_barrier_black_box_odd_team, local_mem_size, process_count); } + +TEST(TEST_SYNC_API, test_barrier_black_box_team_part_pes) +{ + const int32_t process_count = test_gnpu_num; + uint64_t local_mem_size = 1024UL * 1024UL * 16; + test_mutil_task(test_barrier_black_box_team_part_pes, local_mem_size, process_count); +} + +TEST(TEST_SYNC_API, test_barrier_black_box_half_team) +{ + const int32_t process_count = test_gnpu_num; + uint64_t local_mem_size = 1024UL * 1024UL * 16; + test_mutil_task(test_barrier_black_box_half_team, local_mem_size, process_count); +} + +TEST(TEST_SYNC_API, test_barrier_black_box_one_pe_team) +{ + const int32_t process_count = test_gnpu_num; + uint64_t local_mem_size = 1024UL * 1024UL * 16; + test_mutil_task(test_barrier_black_box_one_pe_team, local_mem_size, process_count); +} diff --git a/tests/unittest/host/sync/config/config_host_test.cpp b/tests/unittest/host/sync/config/config_host_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7ddacf47b4449f9b6e8d49b15988a01610bda5e0 --- /dev/null +++ b/tests/unittest/host/sync/config/config_host_test.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#include
+#include
+#include
+
+#include "acl/acl.h"
+#include "shmem_api.h"
+#include "shmemi_host_common.h"
+#include "unittest_main_test.h"
+
+static void test_get_ffts_config_lifecycle(int32_t rank_id, int32_t n_ranks, uint64_t local_mem_size)
+{
+    const uint64_t cfg_before = shmemx_get_ffts_config();
+    EXPECT_EQ(cfg_before, 0UL);
+
+    int32_t device_id = rank_id % test_gnpu_num + test_first_npu;
+    aclrtStream stream {nullptr};
+    test_init(rank_id, n_ranks, local_mem_size, &stream);
+    ASSERT_NE(stream, nullptr);
+
+    const uint64_t cfg_after_init = shmemx_get_ffts_config();
+    ASSERT_NE(cfg_after_init, 0UL);
+
+    for (int i = 0; i < 256; ++i) {
+        ASSERT_EQ(shmemx_get_ffts_config(), cfg_after_init);
+    }
+
+    test_finalize(stream, device_id);
+}
+
+TEST(TEST_SYNC_API, test_get_ffts_config_lifecycle)
+{
+    const int32_t process_count = test_gnpu_num;
+    const uint64_t local_mem_size = 1024UL * 1024UL * 16;
+    test_mutil_task(test_get_ffts_config_lifecycle, local_mem_size, process_count);
+}
diff --git a/tests/unittest/host/sync/order/order_host_test.cpp b/tests/unittest/host/sync/order/order_host_test.cpp
index bd2c10f0ac2e434f491d8a2d548e3b216c92e020..48768577ebad137d03b009a11aae95d100d69cb9 100644
--- a/tests/unittest/host/sync/order/order_host_test.cpp
+++ b/tests/unittest/host/sync/order/order_host_test.cpp
@@ -18,6 +18,8 @@
 #include "unittest_main_test.h"
 #include "order_kernel.h"
 
+constexpr int32_t SHMEM_ORDER_TEST_NUM = 1024;
+
 static void test_quiet_order(int32_t rank_id, int32_t n_ranks, uint64_t local_mem_size)
 {
     aclrtStream stream;
@@ -31,17 +33,20 @@ static void test_quiet_order(int32_t rank_id, int32_t n_ranks, uint64_t local_me
 
     std::vector<uint64_t> host_buf(total_size, 0);
 
-    std::cout << "[TEST] fence order test rank " << rank_id << std::endl;
-    quiet_order_do(stream, shmemx_get_ffts_config(), (uint8_t *)dev_ptr, rank_id, n_ranks);
+    for (int32_t i = 1; i <= SHMEM_ORDER_TEST_NUM; i++) {
+        std::cout << "[TEST] quiet order test rank " << rank_id << " time: " << i << std::endl;
+        quiet_order_do(stream, shmemx_get_ffts_config(), (uint8_t *)dev_ptr, rank_id, n_ranks);
 
-    ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
-    ASSERT_EQ(aclrtMemcpy(host_buf.data(), total_size * sizeof(uint64_t), dev_ptr, total_size * sizeof(uint64_t),
-        ACL_MEMCPY_DEVICE_TO_HOST),
-        0);
+        ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+        ASSERT_EQ(aclrtMemcpy(host_buf.data(), total_size * sizeof(uint64_t), dev_ptr, total_size * sizeof(uint64_t),
+                              ACL_MEMCPY_DEVICE_TO_HOST),
+                  0);
 
-    if (rank_id == 1) {
-        ASSERT_EQ(host_buf[33U], 0xBBu);
-        ASSERT_EQ(host_buf[34U], 0xAAu);
+        if (rank_id == 1) {
+            ASSERT_EQ(host_buf[33U], 0xBBu);
+            ASSERT_EQ(host_buf[34U], 0xAAu);
+        }
+        shm::shmemi_control_barrier_all();
     }
 
     shmem_free(dev_ptr);
@@ -65,18 +70,22 @@ static void test_fence_order(int32_t rank_id, int32_t n_ranks, uint64_t local_me
 
     std::vector<uint64_t> addr_host(total_size, 0);
 
-    std::cout << "[TEST] fence order test rank " << rank_id << std::endl;
-    fence_order_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, n_ranks);
+    for (int32_t i = 1; i <= SHMEM_ORDER_TEST_NUM; i++) {
+        std::cout << "[TEST] fence order test rank " << rank_id << " time: " << i << std::endl;
+        fence_order_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, n_ranks);
 
-    ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
-    ASSERT_EQ(aclrtMemcpy(addr_host.data(), total_size * sizeof(uint64_t), addr_dev, total_size * sizeof(uint64_t),
-        ACL_MEMCPY_DEVICE_TO_HOST),
-        0);
+        ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+        
ASSERT_EQ(aclrtMemcpy(addr_host.data(), total_size * sizeof(uint64_t), addr_dev, total_size * sizeof(uint64_t),
+                              ACL_MEMCPY_DEVICE_TO_HOST),
+                  0);
 
-    if (rank_id == 1) {
-        ASSERT_EQ(addr_host[17U], 84u);
-        ASSERT_EQ(addr_host[18U], 42u);
+        if (rank_id == 1) {
+            ASSERT_EQ(addr_host[17U], 84u);
+            ASSERT_EQ(addr_host[18U], 42u);
+        }
+        shm::shmemi_control_barrier_all();
     }
+
     shmem_free(addr_dev);
 
     test_finalize(stream, device_id);
diff --git a/tests/unittest/host/sync/p2p/p2p_host_test.cpp b/tests/unittest/host/sync/p2p/p2p_host_test.cpp
index 63450f4ff8baa96c403641eafc677b8979d0dc12..0bdb6934f8d18a8a2adeea3e66be728334d00da1 100644
--- a/tests/unittest/host/sync/p2p/p2p_host_test.cpp
+++ b/tests/unittest/host/sync/p2p/p2p_host_test.cpp
@@ -30,9 +30,6 @@ static void test_p2p(int rank_id, int rank_size, uint64_t local_mem_size)
 
     int32_t dev_id = rank_id % test_gnpu_num + test_first_npu;
     test_finalize(stream, dev_id);
-    if (::testing::Test::HasFailure()) {
-        exit(1);
-    }
 }
 
 TEST(TEST_SYNC_API, test_p2p)
@@ -40,4 +37,124 @@ TEST(TEST_SYNC_API, test_p2p)
     const int32_t process_count = test_gnpu_num;
     uint64_t local_mem_size = 1024UL * 1024UL * 16;
     test_mutil_task(test_p2p, local_mem_size, process_count);
+}
+
+static void test_p2p_cmp_ne(int rank_id, int rank_size, uint64_t local_mem_size)
+{
+    aclrtStream stream;
+    test_init(rank_id, rank_size, local_mem_size, &stream);
+
+    int32_t *addr_dev = static_cast<int32_t *>(shmem_malloc(sizeof(int32_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(int32_t), 0, sizeof(int32_t)), 0);
+
+    p2p_chain_cmp_ne_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, rank_size);
+    ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+
+    shmem_free(addr_dev);
+
+    int32_t dev_id = rank_id % test_gnpu_num + test_first_npu;
+    test_finalize(stream, dev_id);
+}
+
+TEST(TEST_SYNC_API, test_p2p_cmp_ne)
+{
+    const int32_t process_count = test_gnpu_num;
+    uint64_t local_mem_size = 1024UL * 1024UL * 16;
+    test_mutil_task(test_p2p_cmp_ne, local_mem_size, process_count);
+}
+
+static void test_p2p_cmp_gt(int rank_id, int rank_size, uint64_t local_mem_size)
+{
+    aclrtStream stream;
+    test_init(rank_id, rank_size, local_mem_size, &stream);
+
+    int32_t *addr_dev = static_cast<int32_t *>(shmem_malloc(sizeof(int32_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(int32_t), 0, sizeof(int32_t)), 0);
+
+    p2p_chain_cmp_gt_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, rank_size);
+    ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+
+    shmem_free(addr_dev);
+
+    int32_t dev_id = rank_id % test_gnpu_num + test_first_npu;
+    test_finalize(stream, dev_id);
+}
+
+TEST(TEST_SYNC_API, test_p2p_cmp_gt)
+{
+    const int32_t process_count = test_gnpu_num;
+    uint64_t local_mem_size = 1024UL * 1024UL * 16;
+    test_mutil_task(test_p2p_cmp_gt, local_mem_size, process_count);
+}
+
+static void test_p2p_cmp_ge(int rank_id, int rank_size, uint64_t local_mem_size)
+{
+    aclrtStream stream;
+    test_init(rank_id, rank_size, local_mem_size, &stream);
+
+    int32_t *addr_dev = static_cast<int32_t *>(shmem_malloc(sizeof(int32_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(int32_t), 0, sizeof(int32_t)), 0);
+
+    p2p_chain_cmp_ge_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, rank_size);
+    ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+
+    shmem_free(addr_dev);
+
+    int32_t dev_id = rank_id % test_gnpu_num + test_first_npu;
+    test_finalize(stream, dev_id);
+}
+
+TEST(TEST_SYNC_API, test_p2p_cmp_ge)
+{
+    const int32_t process_count = test_gnpu_num;
+    uint64_t local_mem_size = 1024UL * 1024UL * 16;
+    
test_mutil_task(test_p2p_cmp_ge, local_mem_size, process_count);
+}
+
+static void test_p2p_cmp_lt(int rank_id, int rank_size, uint64_t local_mem_size)
+{
+    aclrtStream stream;
+    test_init(rank_id, rank_size, local_mem_size, &stream);
+
+    int32_t *addr_dev = static_cast<int32_t *>(shmem_malloc(sizeof(int32_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(int32_t), 0, sizeof(int32_t)), 0);
+
+    p2p_chain_cmp_lt_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, rank_size);
+    ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+
+    shmem_free(addr_dev);
+
+    int32_t dev_id = rank_id % test_gnpu_num + test_first_npu;
+    test_finalize(stream, dev_id);
+}
+
+TEST(TEST_SYNC_API, test_p2p_cmp_lt)
+{
+    const int32_t process_count = test_gnpu_num;
+    uint64_t local_mem_size = 1024UL * 1024UL * 16;
+    test_mutil_task(test_p2p_cmp_lt, local_mem_size, process_count);
+}
+
+static void test_p2p_cmp_le(int rank_id, int rank_size, uint64_t local_mem_size)
+{
+    aclrtStream stream;
+    test_init(rank_id, rank_size, local_mem_size, &stream);
+
+    int32_t *addr_dev = static_cast<int32_t *>(shmem_malloc(sizeof(int32_t)));
+    ASSERT_EQ(aclrtMemset(addr_dev, sizeof(int32_t), 0, sizeof(int32_t)), 0);
+
+    p2p_chain_cmp_le_do(stream, shmemx_get_ffts_config(), (uint8_t *)addr_dev, rank_id, rank_size);
+    ASSERT_EQ(aclrtSynchronizeStream(stream), 0);
+
+    shmem_free(addr_dev);
+
+    int32_t dev_id = rank_id % test_gnpu_num + test_first_npu;
+    test_finalize(stream, dev_id);
+}
+
+TEST(TEST_SYNC_API, test_p2p_cmp_le)
+{
+    const int32_t process_count = test_gnpu_num;
+    uint64_t local_mem_size = 1024UL * 1024UL * 16;
+    test_mutil_task(test_p2p_cmp_le, local_mem_size, process_count);
 }
\ No newline at end of file