From 43ebddd283e4c4ed9470a4ce5790d7efbca6890b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Fri, 20 Feb 2026 19:22:22 +0100 Subject: [PATCH] fix: speed up and simplify scrollintoview @ref --- README.md | 4 +- .../RunnerTests.swift | 55 +++++++++-- src/core/dispatch.ts | 7 +- src/daemon/__tests__/scroll-planner.test.ts | 11 +++ .../handlers/__tests__/interaction.test.ts | 25 +---- src/daemon/handlers/interaction.ts | 92 +------------------ src/daemon/scroll-planner.ts | 17 +++- website/docs/docs/commands.md | 4 +- 8 files changed, 90 insertions(+), 125 deletions(-) diff --git a/README.md b/README.md index e2dda23e..6ed8b9ee 100644 --- a/README.md +++ b/README.md @@ -197,8 +197,8 @@ Pinch: Swipe timing: - `swipe` accepts optional `durationMs` (default `250`, range `16..10000`). - Android uses requested swipe duration directly. -- iOS uses a safe normalized duration to avoid longpress side effects. -- `scrollintoview` accepts either plain text or a snapshot ref (`@eN`); ref mode uses geometry-based scrolling. +- iOS clamps swipe duration to a safe range (`16..60ms`) to avoid longpress side effects. +- `scrollintoview` accepts either plain text or a snapshot ref (`@eN`); ref mode uses best-effort geometry-based scrolling without post-scroll verification. Run `snapshot` again before follow-up `@ref` commands. ## Skills Install the automation skills listed in [SKILL.md](skills/agent-device/SKILL.md). diff --git a/ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift b/ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift index f0b0c58d..2c56fbfc 100644 --- a/ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift +++ b/ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift @@ -40,6 +40,7 @@ final class RunnerTests: XCTestCase { private let retryCooldown: TimeInterval = 0.2 private let postSnapshotInteractionDelay: TimeInterval = 0.2 private let firstInteractionAfterActivateDelay: TimeInterval = 0.25 + private let scrollInteractionIdleTimeoutDefault: TimeInterval = 1.0 private let minRecordingFps = 1 private let maxRecordingFps = 120 private var needsPostSnapshotInteractionDelay = false @@ -712,7 +713,9 @@ final class RunnerTests: XCTestCase { return Response(ok: false, error: ErrorPayload(message: "drag requires x, y, x2, and y2")) } let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0) - dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration) + withTemporaryScrollIdleTimeoutIfSupported(activeApp) { + dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration) + } return Response(ok: true, data: DataPayload(message: "dragged")) case .dragSeries: guard let x = command.x, let y = command.y, let x2 = command.x2, let y2 = command.y2 else { @@ -725,12 +728,14 @@ final class RunnerTests: XCTestCase { return Response(ok: false, error: ErrorPayload(message: "dragSeries pattern must be one-way or ping-pong")) } let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0) - runSeries(count: count, pauseMs: pauseMs) { idx in - let reverse = pattern == "ping-pong" && (idx % 2 == 1) - if reverse { - dragAt(app: activeApp, x: x2, y: y2, x2: x, y2: y, holdDuration: holdDuration) - } else { - dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration) + withTemporaryScrollIdleTimeoutIfSupported(activeApp) { + runSeries(count: count, pauseMs: pauseMs) { idx in + let reverse = pattern == "ping-pong" && (idx % 2 == 1) + if reverse { + dragAt(app: activeApp, x: x2, y: y2, x2: x, y2: y, holdDuration: holdDuration) + } else { + dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration) + } } } return Response(ok: true, data: DataPayload(message: "drag series")) @@ -756,7 +761,9 @@ final class RunnerTests: XCTestCase { guard let direction = command.direction else { return Response(ok: false, error: ErrorPayload(message: "swipe requires direction")) } - swipe(app: activeApp, direction: direction) + withTemporaryScrollIdleTimeoutIfSupported(activeApp) { + swipe(app: activeApp, direction: direction) + } return Response(ok: true, data: DataPayload(message: "swiped")) case .findText: guard let text = command.text else { @@ -884,6 +891,38 @@ final class RunnerTests: XCTestCase { return target } + private func withTemporaryScrollIdleTimeoutIfSupported( + _ target: XCUIApplication, + operation: () -> Void + ) { + let setter = NSSelectorFromString("setWaitForIdleTimeout:") + guard target.responds(to: setter) else { + operation() + return + } + let previous = target.value(forKey: "waitForIdleTimeout") as? NSNumber + target.setValue(resolveScrollInteractionIdleTimeout(), forKey: "waitForIdleTimeout") + defer { + if let previous { + target.setValue(previous.doubleValue, forKey: "waitForIdleTimeout") + } + } + operation() + } + + private func resolveScrollInteractionIdleTimeout() -> TimeInterval { + guard + let raw = ProcessInfo.processInfo.environment["AGENT_DEVICE_IOS_INTERACTION_IDLE_TIMEOUT"], + !raw.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + else { + return scrollInteractionIdleTimeoutDefault + } + guard let parsed = Double(raw), parsed >= 0 else { + return scrollInteractionIdleTimeoutDefault + } + return min(parsed, 30) + } + private func shouldRetryCommand(_ command: Command) -> Bool { if isEnvTruthy("AGENT_DEVICE_RUNNER_DISABLE_READONLY_RETRY") { return false diff --git a/src/core/dispatch.ts b/src/core/dispatch.ts index e53df870..da964f9f 100644 --- a/src/core/dispatch.ts +++ b/src/core/dispatch.ts @@ -219,7 +219,7 @@ export async function dispatchCommand( const requestedDurationMs = positionals[4] ? Number(positionals[4]) : 250; const durationMs = requireIntInRange(requestedDurationMs, 'durationMs', 16, 10_000); - const effectiveDurationMs = device.platform === 'ios' ? 60 : durationMs; + const effectiveDurationMs = device.platform === 'ios' ? clampIosSwipeDuration(durationMs) : durationMs; const count = requireIntInRange(context?.count ?? 1, 'count', 1, 200); const pauseMs = requireIntInRange(context?.pauseMs ?? 0, 'pause-ms', 0, 10_000); const pattern = context?.pattern ?? 'one-way'; @@ -511,6 +511,11 @@ function requireIntInRange(value: number, name: string, min: number, max: number return value; } +function clampIosSwipeDuration(durationMs: number): number { + // Keep iOS swipes stable while allowing explicit fast durations for scroll-heavy flows. + return Math.min(60, Math.max(16, Math.round(durationMs))); +} + export function shouldUseIosTapSeries( device: DeviceInfo, count: number, diff --git a/src/daemon/__tests__/scroll-planner.test.ts b/src/daemon/__tests__/scroll-planner.test.ts index dd6cb4c3..d4f8516b 100644 --- a/src/daemon/__tests__/scroll-planner.test.ts +++ b/src/daemon/__tests__/scroll-planner.test.ts @@ -36,6 +36,9 @@ test('buildScrollIntoViewPlan computes downward content scroll when target is be assert.ok(plan); assert.equal(plan?.direction, 'down'); assert.ok((plan?.count ?? 0) > 1); + assert.equal(plan?.x, 80); + assert.equal(plan?.startY, 726); + assert.equal(plan?.endY, 118); }); test('buildScrollIntoViewPlan returns null when already in safe viewport band', () => { @@ -45,3 +48,11 @@ test('buildScrollIntoViewPlan returns null when already in safe viewport band', assert.equal(plan, null); assert.equal(isRectWithinSafeViewportBand(targetRect, viewportRect), true); }); + +test('buildScrollIntoViewPlan keeps swipe lane inside viewport when target center is out of bounds', () => { + const targetRect = { x: 1000, y: 2100, width: 120, height: 40 }; + const viewportRect = { x: 0, y: 0, width: 390, height: 844 }; + const plan = buildScrollIntoViewPlan(targetRect, viewportRect); + assert.ok(plan); + assert.equal(plan?.x, 351); +}); diff --git a/src/daemon/handlers/__tests__/interaction.test.ts b/src/daemon/handlers/__tests__/interaction.test.ts index f3059ac4..dea4f792 100644 --- a/src/daemon/handlers/__tests__/interaction.test.ts +++ b/src/daemon/handlers/__tests__/interaction.test.ts @@ -214,7 +214,6 @@ test('scrollintoview @ref dispatches geometry-based swipe series', async () => { positionals: string[]; context: Record | undefined; }> = []; - let snapshotCallCount = 0; const response = await handleInteractionCommands({ req: { token: 't', @@ -227,16 +226,6 @@ test('scrollintoview @ref dispatches geometry-based swipe series', async () => { sessionStore, contextFromFlags, dispatch: async (_device, command, positionals, _out, context) => { - if (command === 'snapshot') { - snapshotCallCount += 1; - return { - nodes: [ - { index: 0, type: 'Application', rect: { x: 0, y: 0, width: 390, height: 844 } }, - { index: 1, type: 'XCUIElementTypeStaticText', label: 'Far item', rect: { x: 20, y: 320, width: 120, height: 40 } }, - ], - backend: 'xctest', - }; - } dispatchCalls.push({ command, positionals, context: context as Record | undefined }); return { ok: true }; }, @@ -244,7 +233,6 @@ test('scrollintoview @ref dispatches geometry-based swipe series', async () => { assert.ok(response); assert.equal(response.ok, true); - assert.equal(snapshotCallCount, 1); assert.equal(dispatchCalls.length, 1); assert.equal(dispatchCalls[0]?.command, 'swipe'); assert.equal(dispatchCalls[0]?.positionals.length, 5); @@ -259,8 +247,6 @@ test('scrollintoview @ref dispatches geometry-based swipe series', async () => { assert.equal(stored?.actions[0]?.command, 'scrollintoview'); const result = (stored?.actions[0]?.result ?? {}) as Record; assert.equal(result.ref, 'e2'); - assert.equal(result.strategy, 'ref-geometry'); - assert.equal(result.verified, true); }); test('scrollintoview @ref returns immediately when target is already in viewport safe band', async () => { @@ -313,7 +299,7 @@ test('scrollintoview @ref returns immediately when target is already in viewport } }); -test('scrollintoview @ref fails if target remains outside viewport after scroll', async () => { +test('scrollintoview @ref does not run post-scroll verification snapshot', async () => { const sessionStore = makeSessionStore(); const sessionName = 'default'; const session = makeSession(sessionName); @@ -335,6 +321,7 @@ test('scrollintoview @ref fails if target remains outside viewport after scroll' backend: 'xctest', }; sessionStore.set(sessionName, session); + let snapshotCallCount = 0; const response = await handleInteractionCommands({ req: { @@ -349,6 +336,7 @@ test('scrollintoview @ref fails if target remains outside viewport after scroll' contextFromFlags, dispatch: async (_device, command) => { if (command === 'snapshot') { + snapshotCallCount += 1; return { nodes: [ { index: 0, type: 'Application', rect: { x: 0, y: 0, width: 390, height: 844 } }, @@ -362,9 +350,6 @@ test('scrollintoview @ref fails if target remains outside viewport after scroll' }); assert.ok(response); - assert.equal(response.ok, false); - if (!response.ok) { - assert.equal(response.error?.code, 'COMMAND_FAILED'); - assert.match(response.error?.message ?? '', /outside viewport/i); - } + assert.equal(response.ok, true); + assert.equal(snapshotCallCount, 0); }); diff --git a/src/daemon/handlers/interaction.ts b/src/daemon/handlers/interaction.ts index a3286806..dace7ccf 100644 --- a/src/daemon/handlers/interaction.ts +++ b/src/daemon/handlers/interaction.ts @@ -23,7 +23,7 @@ import { splitSelectorFromArgs, } from '../selectors.ts'; import { withDiagnosticTimer } from '../../utils/diagnostics.ts'; -import { buildScrollIntoViewPlan, isRectWithinSafeViewportBand, resolveViewportRect } from '../scroll-planner.ts'; +import { buildScrollIntoViewPlan, resolveViewportRect } from '../scroll-planner.ts'; type ContextFromFlags = ( flags: CommandFlags | undefined, @@ -598,14 +598,14 @@ export async function handleInteractionCommands(params: { command, positionals: req.positionals ?? [], flags: req.flags ?? {}, - result: { ref, attempts: 0, alreadyVisible: true, strategy: 'ref-geometry', refLabel, selectorChain }, + result: { ref, attempts: 0, alreadyVisible: true, refLabel, selectorChain }, }); - return { ok: true, data: { ref, attempts: 0, alreadyVisible: true, strategy: 'ref-geometry' } }; + return { ok: true, data: { ref, attempts: 0, alreadyVisible: true } }; } const data = await dispatch( session.device, 'swipe', - [String(plan.x), String(plan.startY), String(plan.x), String(plan.endY), '60'], + [String(plan.x), String(plan.startY), String(plan.x), String(plan.endY), '16'], req.flags?.out, { ...contextFromFlags(req.flags, session.appBundleId, session.trace?.outPath), @@ -614,15 +614,6 @@ export async function handleInteractionCommands(params: { pattern: 'one-way', }, ); - const verification = await verifyRefTargetInViewport({ - session, - flags: req.flags, - sessionStore, - contextFromFlags, - dispatch, - selectorChain, - }); - if (!verification.ok) return verification.response; sessionStore.recordAction(session, { command, positionals: req.positionals ?? [], @@ -632,8 +623,6 @@ export async function handleInteractionCommands(params: { ref, attempts: plan.count, direction: plan.direction, - strategy: 'ref-geometry', - verified: true, refLabel, selectorChain, }, @@ -645,8 +634,6 @@ export async function handleInteractionCommands(params: { ref, attempts: plan.count, direction: plan.direction, - strategy: 'ref-geometry', - verified: true, }, }; } @@ -761,74 +748,3 @@ function resolveRefTarget(params: { } return { ok: true, target: { ref, node, snapshotNodes: session.snapshot.nodes } }; } - -async function verifyRefTargetInViewport(params: { - session: SessionState; - flags: CommandFlags | undefined; - sessionStore: SessionStore; - contextFromFlags: ContextFromFlags; - dispatch: typeof dispatchCommand; - selectorChain: string[]; -}): Promise<{ ok: true } | { ok: false; response: DaemonResponse }> { - const { session, flags, sessionStore, contextFromFlags, dispatch, selectorChain } = params; - if (selectorChain.length === 0) { - return { - ok: false, - response: { ok: false, error: { code: 'COMMAND_FAILED', message: 'scrollintoview verification selector is empty' } }, - }; - } - let chainExpression = ''; - try { - chainExpression = selectorChain.join(' || '); - parseSelectorChain(chainExpression); - } catch { - return { - ok: false, - response: { ok: false, error: { code: 'COMMAND_FAILED', message: 'scrollintoview verification selector is invalid' } }, - }; - } - const snapshot = await captureSnapshotForSession( - session, - flags, - sessionStore, - contextFromFlags, - { interactiveOnly: true }, - dispatch, - ); - const chain = parseSelectorChain(chainExpression); - const resolved = resolveSelectorChain(snapshot.nodes, chain, { - platform: session.device.platform, - requireRect: true, - requireUnique: false, - disambiguateAmbiguous: true, - }); - if (!resolved?.node.rect) { - return { - ok: false, - response: { - ok: false, - error: { code: 'COMMAND_FAILED', message: 'scrollintoview target could not be verified after scrolling' }, - }, - }; - } - const viewportRect = resolveViewportRect(snapshot.nodes, resolved.node.rect); - if (!viewportRect) { - return { - ok: false, - response: { - ok: false, - error: { code: 'COMMAND_FAILED', message: 'scrollintoview could not infer viewport during verification' }, - }, - }; - } - if (!isRectWithinSafeViewportBand(resolved.node.rect, viewportRect)) { - return { - ok: false, - response: { - ok: false, - error: { code: 'COMMAND_FAILED', message: 'scrollintoview target is still outside viewport after scrolling' }, - }, - }; - } - return { ok: true }; -} diff --git a/src/daemon/scroll-planner.ts b/src/daemon/scroll-planner.ts index 1ca9fb94..e3d0503c 100644 --- a/src/daemon/scroll-planner.ts +++ b/src/daemon/scroll-planner.ts @@ -38,22 +38,27 @@ export function resolveViewportRect(nodes: RawSnapshotNode[], targetRect: Rect): export function buildScrollIntoViewPlan(targetRect: Rect, viewportRect: Rect): ScrollIntoViewPlan | null { const viewportHeight = Math.max(1, viewportRect.height); + const viewportWidth = Math.max(1, viewportRect.width); const viewportTop = viewportRect.y; const viewportBottom = viewportRect.y + viewportHeight; + const viewportLeft = viewportRect.x; + const viewportRight = viewportRect.x + viewportWidth; const safeTop = viewportTop + viewportHeight * 0.25; const safeBottom = viewportBottom - viewportHeight * 0.25; + const lanePaddingPx = Math.max(8, viewportWidth * 0.1); const targetCenterY = targetRect.y + targetRect.height / 2; + const targetCenterX = targetRect.x + targetRect.width / 2; if (targetCenterY >= safeTop && targetCenterY <= safeBottom) { return null; } - const x = Math.round(viewportRect.x + viewportRect.width / 2); - const dragUpStartY = Math.round(viewportTop + viewportHeight * 0.78); - const dragUpEndY = Math.round(viewportTop + viewportHeight * 0.22); + const x = Math.round(clamp(targetCenterX, viewportLeft + lanePaddingPx, viewportRight - lanePaddingPx)); + const dragUpStartY = Math.round(viewportTop + viewportHeight * 0.86); + const dragUpEndY = Math.round(viewportTop + viewportHeight * 0.14); const dragDownStartY = dragUpEndY; const dragDownEndY = dragUpStartY; - const swipeStepPx = Math.max(1, Math.abs(dragUpStartY - dragUpEndY) * 0.9); + const swipeStepPx = Math.max(1, Math.abs(dragUpStartY - dragUpEndY)); if (targetCenterY > safeBottom) { const delta = targetCenterY - safeBottom; @@ -111,3 +116,7 @@ function pickLargestRect(rects: Rect[]): Rect | null { function clampInt(value: number, min: number, max: number): number { return Math.min(max, Math.max(min, Math.round(value))); } + +function clamp(value: number, min: number, max: number): number { + return Math.min(max, Math.max(min, value)); +} diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 2f363cec..4caf397a 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -67,8 +67,8 @@ agent-device pinch 0.5 200 400 # zoom out at coordinates (iOS simulator) `fill` clears then types. `type` does not clear. On Android, `fill` also verifies text and performs one clear-and-retry pass on mismatch. `swipe` accepts an optional `durationMs` argument (default `250ms`, range `16..10000`). -On iOS, swipe timing uses a safe normalized duration to avoid longpress side effects. -`scrollintoview` accepts plain text or a snapshot ref (`@eN`); ref mode uses geometry-based scrolling. +On iOS, swipe duration is clamped to a safe range (`16..60ms`) to avoid longpress side effects. +`scrollintoview` accepts plain text or a snapshot ref (`@eN`); ref mode uses best-effort geometry-based scrolling without post-scroll verification. Run `snapshot` again before follow-up `@ref` commands. `longpress` is supported on iOS and Android. `pinch` is iOS simulator-only.