From 6e3b496ac46288553e90ab0ef659e26e7ccbb043 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1@llnl.gov>
Date: Sat, 30 Aug 2025 00:43:52 -0700
Subject: [PATCH 1/2] [clang][CUDA] Avoid accounting for tail padding in LLVM
 offloading

---
 clang/lib/CodeGen/CGCUDANV.cpp                | 24 +++++++++++++++----
 offload/include/Shared/APITypes.h             |  2 ++
 .../common/src/PluginInterface.cpp            |  4 +++-
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  2 +-
 .../offloading/CUDA/basic_launch_multi_arg.cu |  8 +++++++
 5 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index cb16fe1b36c68..59fb4b9e94bf9 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -327,9 +327,10 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
 /// (void*, short, void*) is passed as {void **, short *, void **} to the launch
 /// function. For the LLVM/offload launch we flatten the arguments into the
 /// struct directly. In addition, we include the size of the arguments, thus
-/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
-/// nullptr}. The last nullptr needs to be initialized to an array of pointers
-/// pointing to the arguments if we want to offload to the host.
+/// pass {size of ({void *, short, void *}) without tail padding, ptr to {void
+/// *, short, void *}, nullptr}. The last nullptr needs to be initialized to an
+/// array of pointers pointing to the arguments if we want to offload to the
+/// host.
 Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
                                                       FunctionArgList &Args) {
   SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
@@ -339,6 +340,7 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
 
   auto *Int64Ty = CGF.Builder.getInt64Ty();
   KernelLaunchParamsTypes.push_back(Int64Ty);
+  KernelLaunchParamsTypes.push_back(Int64Ty);
   KernelLaunchParamsTypes.push_back(PtrTy);
   KernelLaunchParamsTypes.push_back(PtrTy);
 
@@ -351,12 +353,24 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
       "kernel_launch_params");
 
   auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
+
+  // Avoid accounting the tail padding for CUDA.
+  auto KernelArgsSizeNoTailPadding = llvm::TypeSize::getZero();
+  if (auto N = KernelArgsTy->getNumElements()) {
+    auto *SL = CGM.getDataLayout().getStructLayout(KernelArgsTy);
+    KernelArgsSizeNoTailPadding = SL->getElementOffset(N - 1);
+    KernelArgsSizeNoTailPadding += CGM.getDataLayout().getTypeAllocSize(
+        KernelArgsTy->getElementType(N - 1));
+  }
+
   CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
-  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
+  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSizeNoTailPadding),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
-  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
+  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
+  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 3));
 
   for (unsigned i = 0; i < Args.size(); ++i) {
     auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..52725a0474c6a 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -121,6 +121,8 @@ static_assert(sizeof(KernelArgsTy) ==
 struct KernelLaunchParamsTy {
   /// Size of the Data array.
   size_t Size = 0;
+  /// Size of the Data array without tail padding.
+  size_t SizeNoTailPadding = 0;
   /// Flat array of kernel parameters.
   void *Data = nullptr;
   /// Ptrs to the Data entries. Only strictly required for the host plugin.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 7d05dd25dbf75..979c026c219a6 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -571,7 +571,9 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs(
         (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
     Ptrs[I] = &Args[I];
   }
-  return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
+
+  size_t ArgsSize = sizeof(void *) * NumArgs;
+  return KernelLaunchParamsTy{ArgsSize, ArgsSize, &Args[0], &Ptrs[0]};
 }
 
 uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b2f840113cff3..81b33a93f16b1 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1414,7 +1414,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
 
   void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&LaunchParams.Size),
+                    reinterpret_cast<void *>(&LaunchParams.SizeNoTailPadding),
                     CU_LAUNCH_PARAM_END};
 
   // If we are running an RPC server we want to wake up the server thread
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 1f84a0e1288d4..ab6f753150932 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -23,6 +23,10 @@ __global__ void square(int *Dst, short Q, int *Src, short P) {
   Src[1] = P;
 }
 
+__global__ void accumulate(short Q, int *Dst, char P) {
+  *Dst += Q + P;
+}
+
 int main(int argc, char **argv) {
   int DevNo = 0;
   int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
@@ -39,5 +43,9 @@ int main(int argc, char **argv) {
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   printf("Src: %i : %i\n", Src[0], Src[1]);
   // CHECK: Src: 3 : 4
+  accumulate<<<1, 1>>>(3, Ptr, 7);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 52
   llvm_omp_target_free_shared(Ptr, DevNo);
+  llvm_omp_target_free_shared(Src, DevNo);
 }

From 08dfe946eebd0b5d82a04f4ec79467129a4e3f91 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1@llnl.gov>
Date: Thu, 4 Sep 2025 16:01:09 -0700
Subject: [PATCH 2/2] Use the same args size (without tail padding) for AMDGPU

---
 clang/lib/CodeGen/CGCUDANV.cpp                  | 17 ++++++-----------
 offload/include/Shared/APITypes.h               |  4 +---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp      |  5 -----
 .../common/src/PluginInterface.cpp              |  2 +-
 offload/plugins-nextgen/cuda/src/rtl.cpp        |  2 +-
 5 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 59fb4b9e94bf9..a0aeb21bee380 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -340,7 +340,6 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
 
   auto *Int64Ty = CGF.Builder.getInt64Ty();
   KernelLaunchParamsTypes.push_back(Int64Ty);
-  KernelLaunchParamsTypes.push_back(Int64Ty);
   KernelLaunchParamsTypes.push_back(PtrTy);
   KernelLaunchParamsTypes.push_back(PtrTy);
 
@@ -352,25 +351,21 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
       KernelLaunchParamsTy, CharUnits::fromQuantity(16),
       "kernel_launch_params");
 
-  auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
-
-  // Avoid accounting the tail padding for CUDA.
-  auto KernelArgsSizeNoTailPadding = llvm::TypeSize::getZero();
+  // Avoid accounting the tail padding for the kernel arguments.
+  auto KernelArgsSize = llvm::TypeSize::getZero();
   if (auto N = KernelArgsTy->getNumElements()) {
     auto *SL = CGM.getDataLayout().getStructLayout(KernelArgsTy);
-    KernelArgsSizeNoTailPadding = SL->getElementOffset(N - 1);
-    KernelArgsSizeNoTailPadding += CGM.getDataLayout().getTypeAllocSize(
+    KernelArgsSize += SL->getElementOffset(N - 1);
+    KernelArgsSize += CGM.getDataLayout().getTypeAllocSize(
         KernelArgsTy->getElementType(N - 1));
   }
 
   CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
                           CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
-  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSizeNoTailPadding),
-                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
   CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
-                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
   CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
-                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 3));
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
 
   for (unsigned i = 0; i < Args.size(); ++i) {
     auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 52725a0474c6a..9868370de3d73 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -119,10 +119,8 @@ static_assert(sizeof(KernelArgsTy) ==
 
 /// Flat array of kernel launch parameters and their total size.
 struct KernelLaunchParamsTy {
-  /// Size of the Data array.
+  /// Size of the Data array without the tail padding.
   size_t Size = 0;
-  /// Size of the Data array without tail padding.
-  size_t SizeNoTailPadding = 0;
   /// Flat array of kernel parameters.
   void *Data = nullptr;
   /// Ptrs to the Data entries. Only strictly required for the host plugin.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 64470e9fabf46..dc0e5c3ba11ef 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3658,11 +3658,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                  KernelArgsTy &KernelArgs,
                                  KernelLaunchParamsTy LaunchParams,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) const {
-  if (ArgsSize != LaunchParams.Size &&
-      ArgsSize > LaunchParams.Size + getImplicitArgsSize())
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                         "invalid kernel arguments size");
-
   AMDGPUPluginTy &AMDGPUPlugin =
       static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
   AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 979c026c219a6..56006d03ef325 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -573,7 +573,7 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs(
   }
 
   size_t ArgsSize = sizeof(void *) * NumArgs;
-  return KernelLaunchParamsTy{ArgsSize, ArgsSize, &Args[0], &Ptrs[0]};
+  return KernelLaunchParamsTy{ArgsSize, &Args[0], &Ptrs[0]};
 }
 
 uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 81b33a93f16b1..b2f840113cff3 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1414,7 +1414,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
 
   void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&LaunchParams.SizeNoTailPadding),
+                    reinterpret_cast<void *>(&LaunchParams.Size),
                     CU_LAUNCH_PARAM_END};
 
   // If we are running an RPC server we want to wake up the server thread