diff --git a/src/driver/main.cpp b/src/driver/main.cpp index e96f7f7cb23..fae90188e01 100644 --- a/src/driver/main.cpp +++ b/src/driver/main.cpp @@ -984,9 +984,11 @@ struct time_cmd : command { compiler c; unsigned n = 100; + unsigned nbuffers = 1; void parse(argument_parser& ap) { ap(n, {"--iterations", "-n"}, ap.help("Number of iterations to run.")); + ap(nbuffers, {"--buffers", "-b"}, ap.help("Number of rotated buffers to use.")); c.parse(ap); } @@ -994,9 +996,14 @@ struct time_cmd : command { auto p = c.compile(); log::info() << "Allocating params ..."; - auto m = c.params(p); + std::vector ms; + for(auto i : range(nbuffers)) + { + (void)i; + ms.push_back(c.params(p)); + } log::info() << "Running ..."; - double t = time_run(p, m, n); + double t = time_run(p, ms, n); std::cout << "Total time: " << t << "ms" << std::endl; } }; diff --git a/src/driver/perf.cpp b/src/driver/perf.cpp index fdd20331163..55964e5f6c7 100644 --- a/src/driver/perf.cpp +++ b/src/driver/perf.cpp @@ -140,16 +140,15 @@ bool is_offload_copy_set(const program& p) return param_ins.empty(); } -double time_run(const program& p, const parameter_map& m, int n) +double time_run(const program& p, const std::vector& ms, int n) { // Run once without timing - p.eval(m); + p.eval(ms.back()); p.finish(); double total = time([&] { for(auto i : range(n)) { - (void)i; - p.eval(m); + p.eval(ms[i % ms.size()]); } p.finish(); }); diff --git a/src/driver/perf.hpp b/src/driver/perf.hpp index 4602be38140..4066c6b7eba 100644 --- a/src/driver/perf.hpp +++ b/src/driver/perf.hpp @@ -49,7 +49,7 @@ target get_target(bool gpu); */ bool is_offload_copy_set(const program& p); -double time_run(const program& p, const parameter_map& m, int n = 100); +double time_run(const program& p, const std::vector& ms, int n = 100); } // namespace MIGRAPHX_INLINE_NS } // namespace driver diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp index d97355a1d5c..7c2c2cd02f7 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp @@ -32,14 +32,54 @@ #include #include #include +#include namespace migraphx { +// Unsigned integer with the same size as T, used to move struct element types (eg fp8) +// through a builtin that only accepts arithmetic and vector types. +template +using nontemporal_storage = conditional_t< + sizeof(T) == 1, + uint8_t, + conditional_t>>; + +// Load a single value with a nontemporal hint so it bypasses the cache. The builtin only +// accepts arithmetic and vector types, so any other trivially-copyable type is loaded +// through a same-sized integer and bit-cast back. +template +__device__ T nontemporal_load(const T* ptr) +{ + if constexpr(is_integral{} or is_floating_point{} or is_any_vec()) + { + return __builtin_nontemporal_load(ptr); + } + else + { + static_assert(is_trivially_copyable{}); + using storage = nontemporal_storage; + static_assert(sizeof(storage) == sizeof(T)); + return bit_cast(__builtin_nontemporal_load(reinterpret_cast(ptr))); + } +} + +// Read an element from an input tensor. Inputs that are not broadcasted are read only +// once, so a nontemporal load avoids polluting the cache. Broadcasted inputs reuse the +// same element across threads, so a regular cached load is kept for them. +template +__device__ auto pointwise_load(const T& x, I i) +{ + if constexpr(get_shape_c{}.broadcasted()) + return x[i]; + else + return nontemporal_load(&x[i]); +} + template __device__ void pointwise_tensor(Stride stride, F f, Output out, T x, Ts... xs) { stride(x.get_shape().elements(), [&](auto i) { - auto r = f(x[i], xs[i]...); + auto r = f(pointwise_load(x, i), pointwise_load(xs, i)...); out([&](auto... outs) { r([&](auto... rs) { static_assert(sizeof...(outs) == sizeof...(rs));