Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ set(WEBGPU_SRCS
runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
runtime/ops/rope/RotaryEmbedding.cpp
runtime/ops/prepack/Prepack.cpp
runtime/ops/view_copy/ViewCopy.cpp
)

add_library(webgpu_backend ${WEBGPU_SRCS})
Expand Down
20 changes: 20 additions & 0 deletions backends/webgpu/runtime/WebGPUGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,16 @@ void WebGPUGraph::execute() {
// One pass per dispatch: enforces storage RAW ordering across deps.
for (size_t i = 0; i < n; i++) {
const auto& dispatch = dispatches_[i];
if (dispatch.kind == WebGPUDispatch::Kind::Copy) {
wgpuCommandEncoderCopyBufferToBuffer(
encoder,
dispatch.copy_src,
0,
dispatch.copy_dst,
0,
dispatch.copy_nbytes);
continue;
}
WGPUComputePassDescriptor pass_desc = {};
#ifdef WGPU_BACKEND_ENABLE_PROFILING
// tw must outlive BeginComputePass (the descriptor points at it).
Expand Down Expand Up @@ -757,6 +767,16 @@ void WebGPUGraph::execute() {
wgpuDeviceCreateCommandEncoder(device_, &enc_desc);

for (size_t i = start; i < end; i++) {
if (dispatches_[i].kind == WebGPUDispatch::Kind::Copy) {
wgpuCommandEncoderCopyBufferToBuffer(
encoder,
dispatches_[i].copy_src,
0,
dispatches_[i].copy_dst,
0,
dispatches_[i].copy_nbytes);
continue;
}
WGPUComputePassDescriptor pass_desc = {};
WGPUComputePassEncoder pass =
wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
Expand Down
17 changes: 17 additions & 0 deletions backends/webgpu/runtime/WebGPUGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ struct WebGPUDispatch {
WGPUBindGroup bind_group = nullptr;
uint32_t workgroup_count_x = 1;
std::string kernel_name; // bench label
// DMA copy command; default Compute keeps existing positional inits valid.
enum class Kind { Compute, Copy };
Kind kind = Kind::Compute;
WGPUBuffer copy_src = nullptr;
WGPUBuffer copy_dst = nullptr;
size_t copy_nbytes = 0;
};

struct OutputCopy {
Expand Down Expand Up @@ -189,6 +195,17 @@ class WebGPUGraph {
dispatches_.push_back(dispatch);
}

// Append a buffer-to-buffer copy entry to the dispatch list. The entry goes
// onto dispatches_ like any other dispatch, so it keeps its position relative
// to the entries recorded before and after it (e.g. for a flat copy).
//   src    - buffer the bytes are read from
//   dst    - buffer the bytes are written to
//   nbytes - number of bytes to copy
void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
  WebGPUDispatch copy_cmd;
  copy_cmd.kernel_name = "flat_copy";
  copy_cmd.copy_nbytes = nbytes;
  copy_cmd.copy_dst = dst;
  copy_cmd.copy_src = src;
  // Kind::Copy marks this entry as a copy rather than the default
  // Kind::Compute.
  copy_cmd.kind = WebGPUDispatch::Kind::Copy;
  dispatches_.push_back(copy_cmd);
}

// Materialize a recorded prepack-routed constant into dst via one CPU->GPU
// transfer. Build-time only (the .pte bytes are freed after build()).
// Mirrors Vulkan prepack_standard.
Expand Down
62 changes: 62 additions & 0 deletions backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>

#include <stdexcept>
#include <vector>

namespace executorch::backends::webgpu {

// Records a whole-buffer copy from the tensor `in_id` to the tensor `out_id`.
// Throws std::runtime_error when either id is not a tensor, when either byte
// size is not a multiple of 4, or when the two byte sizes differ. Records
// nothing when both tensors already share one buffer.
void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id) {
  // get_tensor does no type checking of its own, so reject non-tensor ids
  // loudly before touching them.
  const bool both_tensors =
      graph.get_value_type(in_id) == WebGPUGraph::ValueType::Tensor &&
      graph.get_value_type(out_id) == WebGPUGraph::ValueType::Tensor;
  if (!both_tensors) {
    throw std::runtime_error("flat_copy: in/out arg is not a tensor");
  }

  // A contiguous reshape is a flat byte copy; mirrors Vulkan view_buffer
  // (no-remap).
  const auto& src = graph.get_tensor(in_id);
  const auto& dst = graph.get_tensor(out_id);

  // Byte counts must be a multiple of the fp32 element size (4 bytes). This
  // is a size check only; the dtype itself is not verified.
  const auto is_word_multiple = [](auto byte_count) {
    return byte_count % sizeof(float) == 0;
  };
  if (!is_word_multiple(src.nbytes) || !is_word_multiple(dst.nbytes)) {
    throw std::runtime_error("flat_copy: operand not 4-byte aligned");
  }

  // A view keeps numel unchanged; requiring equal sizes also rules out an
  // out-of-bounds copy.
  const bool same_size = src.nbytes == dst.nbytes;
  if (!same_size) {
    throw std::runtime_error("flat_copy: input/output size mismatch");
  }

  // When in/out alias the same buffer the data is already in place, and
  // CopyBufferToBuffer rejects src == dst, so only distinct buffers get a
  // copy recorded.
  if (src.buffer != dst.buffer) {
    graph.add_buffer_copy(src.buffer, dst.buffer, dst.nbytes);
  }
}

namespace {

// Handler for view_copy: a contiguous reshape, implemented as a flat copy
// (mirrors Vulkan view_buffer).
void view_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
  // Argument layout is [self, size, out]. Only the first and the last
  // value-ids are read here; the shape comes from out_tensor.dims rather than
  // from the size argument.
  const int self_id = args.at(0);
  const int out_id = args.at(args.size() - 1);
  add_flat_copy(graph, self_id, out_id);
}

} // namespace

// Associates the op name aten.view_copy.default with view_copy_impl.
// NOTE(review): the registration mechanics are defined by the
// WEBGPU_REGISTER_* macros (presumably in ops/OperatorRegistry.h), which are
// not visible in this file — confirm when and how this block is executed.
WEBGPU_REGISTER_OPERATORS {
WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl);
}

} // namespace executorch::backends::webgpu
18 changes: 18 additions & 0 deletions backends/webgpu/runtime/ops/view_copy/view_copy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>

namespace executorch::backends::webgpu {

// Flat copy output[i]=input[i]; mirrors Vulkan add_view_copy_node (View.h).
// Records a buffer-to-buffer copy on `graph` from tensor `in_id` to tensor
// `out_id`; nothing is recorded when both tensors share the same buffer.
// Throws std::runtime_error if either id is not a tensor, if either tensor's
// byte size is not a multiple of 4, or if the two byte sizes differ.
void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id);

} // namespace executorch::backends::webgpu
Loading