pytorch · meta-codesync · Jun 26, 2026 · Jun 17, 2026 · Jun 18, 2026 · Jun 18, 2026
@@ -42,6 +42,7 @@ set(WEBGPU_SRCS
     runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
     runtime/ops/rope/RotaryEmbedding.cpp
     runtime/ops/prepack/Prepack.cpp
+    runtime/ops/view_copy/ViewCopy.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -679,6 +679,16 @@ void WebGPUGraph::execute() {
     // One pass per dispatch: enforces storage RAW ordering across deps.
     for (size_t i = 0; i < n; i++) {
       const auto& dispatch = dispatches_[i];
+      if (dispatch.kind == WebGPUDispatch::Kind::Copy) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder,
+            dispatch.copy_src,
+            0,
+            dispatch.copy_dst,
+            0,
+            dispatch.copy_nbytes);
+        continue;
+      }
       WGPUComputePassDescriptor pass_desc = {};
 #ifdef WGPU_BACKEND_ENABLE_PROFILING
       // tw must outlive BeginComputePass (the descriptor points at it).
@@ -757,6 +767,16 @@ void WebGPUGraph::execute() {
         wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
 
     for (size_t i = start; i < end; i++) {
+      if (dispatches_[i].kind == WebGPUDispatch::Kind::Copy) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder,
+            dispatches_[i].copy_src,
+            0,
+            dispatches_[i].copy_dst,
+            0,
+            dispatches_[i].copy_nbytes);
+        continue;
+      }
       WGPUComputePassDescriptor pass_desc = {};
       WGPUComputePassEncoder pass =
           wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);

diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
@@ -42,6 +42,12 @@ struct WebGPUDispatch {
   WGPUBindGroup bind_group = nullptr;
   uint32_t workgroup_count_x = 1;
   std::string kernel_name; // bench label
+  // DMA copy command; default Compute keeps existing positional inits valid.
+  enum class Kind { Compute, Copy };
+  Kind kind = Kind::Compute;
+  WGPUBuffer copy_src = nullptr;
+  WGPUBuffer copy_dst = nullptr;
+  size_t copy_nbytes = 0;
 };
 
 struct OutputCopy {
@@ -189,6 +195,17 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Queue a buffer-to-buffer DMA that runs in dispatch order (e.g. flat copy).
+  void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
+    WebGPUDispatch copy_cmd;
+    copy_cmd.kernel_name = "flat_copy";
+    copy_cmd.kind = WebGPUDispatch::Kind::Copy;
+    copy_cmd.copy_nbytes = nbytes;
+    copy_cmd.copy_dst = dst;
+    copy_cmd.copy_src = src;
+    dispatches_.push_back(copy_cmd);
+  }
+
   // Materialize a recorded prepack-routed constant into dst via one CPU->GPU
   // transfer. Build-time only (the .pte bytes are freed after build()).
   // Mirrors Vulkan prepack_standard.

diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id) {
+  // Both ids must name tensors; get_tensor itself performs no type check.
+  if (!(graph.get_value_type(in_id) == WebGPUGraph::ValueType::Tensor &&
+        graph.get_value_type(out_id) == WebGPUGraph::ValueType::Tensor)) {
+    throw std::runtime_error("flat_copy: in/out arg is not a tensor");
+  }
+
+  // A contiguous reshape is a flat byte copy (mirrors Vulkan view_buffer).
+  const auto& src = graph.get_tensor(in_id);
+  const auto& dst = graph.get_tensor(out_id);
+
+  // Byte sizes must be whole 4-byte (fp32-sized) words; dtype is not verified.
+  const bool word_sized =
+      src.nbytes % sizeof(float) == 0 && dst.nbytes % sizeof(float) == 0;
+  if (!word_sized) {
+    throw std::runtime_error("flat_copy: operand not 4-byte aligned");
+  }
+
+  // A view preserves numel; equal sizes also rule out an OOB copy.
+  if (src.nbytes != dst.nbytes) {
+    throw std::runtime_error("flat_copy: input/output size mismatch");
+  }
+
+  // Aliased operands are already in place, and CopyBufferToBuffer rejects
+  // src == dst, so a copy is recorded only for distinct buffers.
+  if (src.buffer != dst.buffer) {
+    graph.add_buffer_copy(src.buffer, dst.buffer, dst.nbytes);
+  }
+}
+
+namespace {
+
+// view_copy: flat copy as in Vulkan view_buffer. args = [self, size, out].
+void view_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  const int out_id = args.at(args.size() - 1);
+  add_flat_copy(graph, args.at(0), out_id);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS { // aten.view_copy.default -> view_copy_impl
+  WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/view_copy/view_copy.h b/backends/webgpu/runtime/ops/view_copy/view_copy.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+
+namespace executorch::backends::webgpu {
+
+// Flat copy out[i]=in[i], cf. Vulkan add_view_copy_node; throws on bad args.
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id);
+
+} // namespace executorch::backends::webgpu