Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,8 @@ if (CUDECOMP_ENABLE_NVSHMEM)
list(APPEND NVSHMEM_VERSION ${CMAKE_MATCH_1})
list(JOIN NVSHMEM_VERSION "." NVSHMEM_VERSION)

if (NVSHMEM_VERSION VERSION_LESS "2.5")
target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem.a)
if (NVSHMEM_VERSION VERSION_LESS "2.6")
message(FATAL_ERROR "NVSHMEM versions earlier than 2.6.0 are not supported by cuDecomp.")
else()
target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_host.so)
target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_device.a)
Expand Down
8 changes: 5 additions & 3 deletions docs/nvshmem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ In general, NVSHMEM operations require memory they operate on to be allocated on
:code:`nvshmem_malloc`. While cuDecomp attempts to hide this complexity behind :code:`cudecompMalloc`, it is important
to understand that memory allocated for usage with NVSHMEM comes out of a separate memory pool than all other
CUDA allocations. At a high-level, NVSHMEM will preallocate this symmetric heap on each GPU when it is initialized,
with the heap size set by the `NVSHMEM_SYMMETRIC_SIZE <https://docs.nvidia.com/hpc-sdk/nvshmem/api/docs/gen/env.html#c.NVSHMEM_SYMMETRIC_SIZE>`_ environment variable.
with the heap size set by the `NVSHMEM_SYMMETRIC_SIZE <https://docs.nvidia.com/nvshmem/api/gen/env.html#c.NVSHMEM_SYMMETRIC_SIZE>`_ environment variable.
As such, it is important to set the symmetric heap size to a value that is large enough for any necessary allocations from cuDecomp,
but not much larger as that will waste GPU memory space.

Expand All @@ -24,11 +24,13 @@ To help with this, the code will produce warnings like the following

if the library detects NVSHMEM allocations that may exceed the symmetric heap size, and suggests an appropriate value for :code:`NVSHMEM_SYMMETRIC_SIZE`.

Note that manual symmetric heap size management is only required if CUDA VMM features are disabled in NVSHMEM via :code:`NVSHMEM_DISABLE_CUDA_VMM=1`.

MPI compatibility
-----------------
As noted in the NVSHMEM documentation `here <https://docs.nvidia.com/hpc-sdk/nvshmem/api/docs/faq.html#interoperability-with-mpi-faqs>`_,
As noted in the NVSHMEM documentation `here <https://docs.nvidia.com/nvshmem/api/faq.html#interoperability-with-mpi-faqs>`_,
memory allocated on the symmetric heap may lead to crashes when used in MPI calls with some MPI implementations, especially when
CUDA VMM features in NVSHMEM are enabled. We strongly encourage users to set :code:`NVSHMEM_DISABLE_CUDA_VMM=1` when using cuDecomp
CUDA VMM features in NVSHMEM are enabled. If you find this is the case for your system, we suggest setting :code:`NVSHMEM_DISABLE_CUDA_VMM=1` when using cuDecomp
with NVSHMEM enabled. However, this is not always sufficient and MPI can still crash when passed NVSHMEM-allocated memory.

Due to this, cuDecomp attempts to avoid using NVSHMEM-allocated memory with MPI where possible but it can arise in a couple of situations:
Expand Down
26 changes: 5 additions & 21 deletions include/internal/comm_routines.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ static inline void checkMpiInt32Limit(int64_t val, cudecompHaloCommBackend_t bac
}

#ifdef ENABLE_NVSHMEM
#define CUDECOMP_NVSHMEM_CHUNK_SZ (static_cast<size_t>(1024 * 1024 * 1024))
#define CUDECOMP_NVSHMEM_INTRAGROUP_SYNC_FREQ 8 // max number of intra-group transfers to schedule between team syncs
template <typename T>
static void
Expand Down Expand Up @@ -170,16 +169,9 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
}
}

// Use host call for direct P2P accessible entries
// Need to chunk host API calls due to 2 GiB limitation in API
size_t send_bytes = send_counts[dst_rank] * sizeof(T);
size_t nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
for (size_t j = 0; j < nchunks; ++j) {
nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
std::min(CUDECOMP_NVSHMEM_CHUNK_SZ, send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
dst_rank_global, handle->streams[count % handle->device_p2p_ce_count]);
}
nvshmemx_putmem_on_stream(recv_buff + recv_offsets[dst_rank], send_buff + send_offsets[dst_rank],
send_counts[dst_rank] * sizeof(T), dst_rank_global,
handle->streams[count % handle->device_p2p_ce_count]);
count++;
}
}
Expand Down Expand Up @@ -440,16 +432,8 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
}

int dst_rank_global = getGlobalRank(handle, grid_desc, comm_axis, dst_rank);
// Need to chunk host API calls due to 2 GiB limitation in API
size_t send_bytes = send_counts[dst_rank] * sizeof(T);
int nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
for (int j = 0; j < nchunks; ++j) {
nvshmemx_putmem_nbi_on_stream(
recv_buff + recv_offsets_nvshmem[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
std::min(static_cast<size_t>(CUDECOMP_NVSHMEM_CHUNK_SZ), send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
dst_rank_global, pl_stream);
}
nvshmemx_putmem_nbi_on_stream(recv_buff + recv_offsets_nvshmem[dst_rank], send_buff + send_offsets[dst_rank],
send_counts[dst_rank] * sizeof(T), dst_rank_global, pl_stream);

barrier = true;
}
Expand Down
22 changes: 16 additions & 6 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -378,12 +378,6 @@ static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
char* vmm_str = std::getenv("NVSHMEM_DISABLE_CUDA_VMM");
if (vmm_str) { handle->nvshmem_vmm = std::strtol(vmm_str, nullptr, 10) == 0; }

if (handle->rank == 0 && handle->nvshmem_vmm) {
printf("CUDECOMP:WARN: NVSHMEM_DISABLE_CUDA_VMM is unset. We currently recommend setting it "
"(i.e. NVSHMEM_DISABLE_CUDA_VMM=1) for best compatibility with MPI libraries. See the documentation "
"for more details.\n");
}

// Check NVSHMEM_SYMMETRIC_SIZE
char* symmetric_size_str = std::getenv("NVSHMEM_SYMMETRIC_SIZE");
if (symmetric_size_str) {
Expand All @@ -405,6 +399,21 @@ static void inspectNvshmemEnvVars(cudecompHandle_t& handle) {
handle->nvshmem_symmetric_size = 1ull << 30;
}
}

// Verifies the linked NVSHMEM library is at least version 2.6.0.
// The version is parsed from the vendor name string returned by
// nvshmem_info_get_name (expected to contain a "vMAJOR.MINOR.PATCH" token,
// e.g. "NVSHMEM v2.6.0"). Throws CUDECOMP internal/not-supported errors on
// parse failure or an unsupported version.
static void checkNvshmemVersion() {
  int major, minor, patch;
  char name[NVSHMEM_MAX_NAME_LEN];
  nvshmem_info_get_name(name);

  // Locate a 'v' that is immediately followed by a digit. A bare
  // strchr(name, 'v') would stop at any earlier lowercase 'v' in the vendor
  // text (e.g. a lowercase "nvshmem") and spuriously fail the parse below.
  const char* vpos = nullptr;
  for (const char* p = name; (p = strchr(p, 'v')) != nullptr; ++p) {
    if (p[1] >= '0' && p[1] <= '9') {
      vpos = p;
      break;
    }
  }
  if (!vpos || sscanf(vpos, "v%d.%d.%d", &major, &minor, &patch) != 3) {
    THROW_INTERNAL_ERROR("Could not parse NVSHMEM version.");
  }

  // We have removed workarounds for bugs encountered with NVSHMEM versions earlier than 2.6.0.
  if ((major == 2 && minor < 6) || major < 2) {
    THROW_NOT_SUPPORTED("NVSHMEM versions earlier than 2.6.0 are not supported.");
  }
}
#endif

} // namespace
Expand Down Expand Up @@ -691,6 +700,7 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
((autotune_transpose_backend || autotune_halo_backend) && !autotune_disable_nvshmem_backends)) {
#ifdef ENABLE_NVSHMEM
if (!handle->nvshmem_initialized) {
checkNvshmemVersion();
inspectNvshmemEnvVars(handle);
initNvshmemFromMPIComm(handle->mpi_comm);
handle->nvshmem_initialized = true;
Expand Down