diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c4e21a..8e81fce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -232,8 +232,8 @@ if (CUDECOMP_ENABLE_NVSHMEM)
     list(APPEND NVSHMEM_VERSION ${CMAKE_MATCH_1})
     list(JOIN NVSHMEM_VERSION "." NVSHMEM_VERSION)
 
-    if (NVSHMEM_VERSION VERSION_LESS "2.5")
-      target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem.a)
+    if (NVSHMEM_VERSION VERSION_LESS "2.6")
+      message(FATAL_ERROR "NVSHMEM versions earlier than 2.6.0 are not supported by cuDecomp.")
     else()
       target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_host.so)
       target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_device.a)
diff --git a/docs/nvshmem.rst b/docs/nvshmem.rst
index e1b5a76..7e4da3c 100644
--- a/docs/nvshmem.rst
+++ b/docs/nvshmem.rst
@@ -12,7 +12,7 @@
 In general, NVSHMEM operations requires memory it operates on to be allocated on :code:`nvshmem_malloc`. While cuDecomp
 attempts to hide this complexity behind :code:`cudecompMalloc`, it is important to understand that memory allocated for
 usage with NVSHMEM comes out of a separate memory pool than all other CUDA allocations.
 At a high-level, NVSHMEM will preallocate this symmetric heap on each GPU when it is initialized,
-with the heap size set by the `NVSHMEM_SYMMETRIC_SIZE `_ environment variable.
+with the heap size set by the `NVSHMEM_SYMMETRIC_SIZE `_ environment variable.
 As such, it is important to set the symmetric heap size to a value that is large enough for any necessary allocations
 from cuDecomp, but not much larger as that will waste GPU memory space.
@@ -24,11 +24,13 @@
 To help with this, the code will produce warnings like the following if the
 library detects NVSHMEM allocations that may exceed the symmetric heap size, and suggests an appropriate value for
 :code:`NVSHMEM_SYMMETRIC_SIZE`.
+Note that manual symmetric heap size management is only required if CUDA VMM features are disabled in NVSHMEM via :code:`NVSHMEM_DISABLE_CUDA_VMM=1`.
+
 MPI compatibility
 -----------------
-As noted in the NVSHMEM documentation `here `_,
+As noted in the NVSHMEM documentation `here `_,
 memory allocated on the symmetric heap may lead to crashes when used in MPI calls with some MPI implementations, especially when
-CUDA VMM features in NVSHMEM are enabled. We strongly encourage users to set :code:`NVSHMEM_DISABLE_CUDA_VMM=1` when using cuDecomp
+CUDA VMM features in NVSHMEM are enabled. If you find this is the case for your system, we suggest setting :code:`NVSHMEM_DISABLE_CUDA_VMM=1` when using cuDecomp
 with NVSHMEM enabled. However, this is not always sufficient and MPI can still crash when passed NVSHMEM allocated memory.
 Due to this, cuDecomp attempts to avoid using NVSHMEM-allocated memory with MPI where possible but it can arise
 in a couple of situations:
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
index efc2fe6..9d7ab45 100644
--- a/include/internal/comm_routines.h
+++ b/include/internal/comm_routines.h
@@ -88,7 +88,6 @@ static inline void checkMpiInt32Limit(int64_t val, cudecompHaloCommBackend_t bac
 }
 
 #ifdef ENABLE_NVSHMEM
-#define CUDECOMP_NVSHMEM_CHUNK_SZ (static_cast<size_t>(1024 * 1024 * 1024))
 #define CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ 8 // max number of intra-group transfers to schedule between team syncs
 template <typename T>
 static void
@@ -170,16 +169,9 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
       }
     }
 
-      // Use host call for direct P2P accessible entries
-      // Need to chunk host API calls due to 2 GiB limitation in API
-      size_t send_bytes = send_counts[dst_rank] * sizeof(T);
-      size_t nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
-      for (size_t j = 0; j < nchunks; ++j) {
-        nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
-                                  send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
-                                  std::min(CUDECOMP_NVSHMEM_CHUNK_SZ, send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
-                                  dst_rank_global, handle->streams[count % handle->device_p2p_ce_count]);
-      }
+      nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank], send_buff + send_offsets[dst_rank],
+                                send_counts[dst_rank] * sizeof(T), dst_rank_global,
+                                handle->streams[count % handle->device_p2p_ce_count]);
       count++;
     }
   }
@@ -440,16 +432,8 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
       }
 
       int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
-      // Need to chunk host API calls due to 2 GiB limitation in API
-      size_t send_bytes = send_counts[dst_rank] * sizeof(T);
-      int nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
-      for (int j = 0; j < nchunks; ++j) {
-        nvshmemx_putmem_nbi_on_stream(
-            recv_buff + recv_offsets_nvshmem[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
-            send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
-            std::min(static_cast<size_t>(CUDECOMP_NVSHMEM_CHUNK_SZ), send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
-            dst_rank_global, pl_stream);
-      }
+      nvshmemx_putmem_nbi_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank], send_buff + send_offsets[dst_rank],
+                                    send_counts[dst_rank] * sizeof(T), dst_rank_global, pl_stream);
 
       barrier = true;
     }
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index d600d4c..5df3435 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -378,12 +378,6 @@ static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
   char* vmm_str = std::getenv("NVSHMEM_DISABLE_CUDA_VMM");
   if (vmm_str) { handle->nvshmem_vmm = std::strtol(vmm_str, nullptr, 10) == 0; }
 
-  if (handle->rank == 0 && handle->nvshmem_vmm) {
-    printf("CUDECOMP:WARN: NVSHMEM_DISABLE_CUDA_VMM is unset. We currently recommend setting it "
-           "(i.e. NVSHMEM_DISABLE_CUDA_VMM=1) for best compatibility with MPI libraries. See the documentation "
-           "for more details.\n");
-  }
-
   // Check NVSHMEM_SYMMETRIC_SIZE
   char* symmetric_size_str = std::getenv("NVSHMEM_SYMMETRIC_SIZE");
   if (symmetric_size_str) {
@@ -405,6 +399,21 @@
     handle->nvshmem_symmetric_size = 1ull << 30;
   }
 }
+
+static void checkNvshmemVersion() {
+  int major, minor, patch;
+  char name[NVSHMEM_MAX_NAME_LEN];
+  nvshmem_info_get_name(name);
+  const char* vpos = strchr(name, 'v');
+  if (!vpos || sscanf(vpos, "v%d.%d.%d", &major, &minor, &patch) != 3) {
+    THROW_INTERNAL_ERROR("Could not parse NVSHMEM version.");
+  }
+
+  // We have removed workarounds for bugs encountered with NVSHMEM versions earlier than 2.6.0.
+  if ((major == 2 && minor < 6) || major < 2) {
+    THROW_NOT_SUPPORTED("NVSHMEM versions earlier than 2.6.0 are not supported.");
+  }
+}
 #endif
 
 } // namespace
@@ -691,6 +700,7 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
       ((autotune_transpose_backend || autotune_halo_backend) && !autotune_disable_nvshmem_backends)) {
 #ifdef ENABLE_NVSHMEM
     if (!handle->nvshmem_initialized) {
+      checkNvshmemVersion();
       inspectNvshmemEnvVars(handle);
       initNvshmemFromMPIComm(handle->mpi_comm);
       handle->nvshmem_initialized = true;