Skip to content
Merged
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,15 @@ npx agent-device open SampleApp
## Quick Start

Use refs for agent-driven exploration and normal automation flows.
Use `press` as the canonical tap command; `click` is an equivalent alias.

```bash
agent-device open Contacts --platform ios # creates session on iOS Simulator
agent-device snapshot
agent-device click @e5
agent-device press @e5
agent-device fill @e6 "John"
agent-device fill @e7 "Doe"
agent-device click @e3
agent-device press @e3
agent-device close
```

Expand All @@ -56,7 +57,7 @@ Basic flow:
```bash
agent-device open SampleApp
agent-device snapshot
agent-device click @e7
agent-device press @e7
agent-device fill @e8 "hello"
agent-device close SampleApp
```
Expand All @@ -73,19 +74,22 @@ agent-device trace stop ./trace.log
Coordinates:
- All coordinate-based commands (`press`, `long-press`, `swipe`, `focus`, `fill`) use device coordinates with origin at top-left.
- X increases to the right, Y increases downward.
- `press` is the canonical tap command.
- `click` is an equivalent alias and accepts the same targets (`x y`, `@ref`, selector) and flags.

Gesture series examples:

```bash
agent-device press 300 500 --count 12 --interval-ms 45
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
agent-device press @e5 --count 5 --double-tap
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
```

## Command Index
- `boot`, `open`, `close`, `reinstall`, `home`, `back`, `app-switcher`
- `snapshot`, `find`, `get`
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `swipe`, `scroll`, `scrollintoview`, `pinch`, `is`
- `press` (alias: `click`), `focus`, `type`, `fill`, `long-press`, `swipe`, `scroll`, `scrollintoview`, `pinch`, `is`
- `alert`, `wait`, `screenshot`
- `trace start`, `trace stop`
- `settings wifi|airplane|location on|off`
Expand All @@ -110,6 +114,7 @@ Flags:
- `--interval-ms <ms>` delay between `press` iterations
- `--hold-ms <ms>` hold duration per `press` iteration
- `--jitter-px <n>` deterministic coordinate jitter for `press`
- `--double-tap` use a double-tap gesture per `press`/`click` iteration (cannot be combined with `--hold-ms` or `--jitter-px`)
- `--pause-ms <ms>` delay between `swipe` iterations
- `--pattern one-way|ping-pong` repeat pattern for `swipe`
- `--verbose` for daemon and runner logs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,20 +219,27 @@ final class RunnerTests: XCTestCase {
let normalizedBundleId = command.appBundleId?
.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedBundleId = (normalizedBundleId?.isEmpty == true) ? nil : normalizedBundleId
let switchedApp: Bool
if let bundleId = requestedBundleId, currentBundleId != bundleId {
let target = XCUIApplication(bundleIdentifier: bundleId)
NSLog("AGENT_DEVICE_RUNNER_ACTIVATE bundle=%@ state=%d", bundleId, target.state.rawValue)
// activate avoids terminating and relaunching the target app
target.activate()
currentApp = target
currentBundleId = bundleId
switchedApp = true
} else if requestedBundleId == nil {
// Do not reuse stale bundle targets when the caller does not explicitly request one.
currentApp = nil
currentBundleId = nil
switchedApp = false
} else {
switchedApp = false
}
let activeApp = currentApp ?? app
_ = activeApp.waitForExistence(timeout: 5)
if switchedApp {
_ = activeApp.waitForExistence(timeout: 5)
}

switch command.command {
case .shutdown:
Expand All @@ -250,6 +257,23 @@ final class RunnerTests: XCTestCase {
return Response(ok: true, data: DataPayload(message: "tapped"))
}
return Response(ok: false, error: ErrorPayload(message: "tap requires text or x/y"))
case .tapSeries:
guard let x = command.x, let y = command.y else {
return Response(ok: false, error: ErrorPayload(message: "tapSeries requires x and y"))
}
let count = max(Int(command.count ?? 1), 1)
let intervalMs = max(command.intervalMs ?? 0, 0)
let doubleTap = command.doubleTap ?? false
if doubleTap {
runSeries(count: count, pauseMs: intervalMs) { _ in
doubleTapAt(app: activeApp, x: x, y: y)
}
return Response(ok: true, data: DataPayload(message: "tap series"))
}
runSeries(count: count, pauseMs: intervalMs) { _ in
tapAt(app: activeApp, x: x, y: y)
}
return Response(ok: true, data: DataPayload(message: "tap series"))
case .longPress:
guard let x = command.x, let y = command.y else {
return Response(ok: false, error: ErrorPayload(message: "longPress requires x and y"))
Expand All @@ -264,6 +288,26 @@ final class RunnerTests: XCTestCase {
let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0)
dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration)
return Response(ok: true, data: DataPayload(message: "dragged"))
case .dragSeries:
guard let x = command.x, let y = command.y, let x2 = command.x2, let y2 = command.y2 else {
return Response(ok: false, error: ErrorPayload(message: "dragSeries requires x, y, x2, and y2"))
}
let count = max(Int(command.count ?? 1), 1)
let pauseMs = max(command.pauseMs ?? 0, 0)
let pattern = command.pattern ?? "one-way"
if pattern != "one-way" && pattern != "ping-pong" {
return Response(ok: false, error: ErrorPayload(message: "dragSeries pattern must be one-way or ping-pong"))
}
let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0)
runSeries(count: count, pauseMs: pauseMs) { idx in
let reverse = pattern == "ping-pong" && (idx % 2 == 1)
if reverse {
dragAt(app: activeApp, x: x2, y: y2, x2: x, y2: y, holdDuration: holdDuration)
} else {
dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration)
}
}
return Response(ok: true, data: DataPayload(message: "drag series"))
case .type:
guard let text = command.text else {
return Response(ok: false, error: ErrorPayload(message: "type requires text"))
Expand Down Expand Up @@ -443,6 +487,12 @@ final class RunnerTests: XCTestCase {
coordinate.tap()
}

/// Issues a double-tap gesture at a position inside `app`.
///
/// The position is built by taking the coordinate at normalized offset (0, 0)
/// of `app` and shifting it by (`x`, `y`).
///
/// - Parameters:
///   - app: Application whose coordinate space the gesture is resolved in.
///   - x: Horizontal offset applied to the normalized-origin coordinate.
///   - y: Vertical offset applied to the normalized-origin coordinate.
private func doubleTapAt(app: XCUIApplication, x: Double, y: Double) {
    let offsetFromOrigin = CGVector(dx: x, dy: y)
    app.coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
        .withOffset(offsetFromOrigin)
        .doubleTap()
}

private func longPressAt(app: XCUIApplication, x: Double, y: Double, duration: TimeInterval) {
let origin = app.coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
let coordinate = origin.withOffset(CGVector(dx: x, dy: y))
Expand All @@ -463,6 +513,17 @@ final class RunnerTests: XCTestCase {
start.press(forDuration: holdDuration, thenDragTo: end)
}

/// Invokes `operation` repeatedly, blocking the calling thread between runs.
///
/// - Parameters:
///   - count: Requested number of runs; anything below 1 is treated as 1.
///   - pauseMs: Delay between consecutive runs, in milliseconds; negative
///     values are treated as 0. No delay follows the final run.
///   - operation: Called once per run with the zero-based run index.
private func runSeries(count: Int, pauseMs: Double, operation: (Int) -> Void) {
    let iterations = max(count, 1)
    let pauseMillis = max(pauseMs, 0)
    var index = 0
    while index < iterations {
        operation(index)
        index += 1
        // Sleep only in the gaps between runs, never after the last one.
        let isLastRun = index == iterations
        if !isLastRun && pauseMillis > 0 {
            Thread.sleep(forTimeInterval: pauseMillis / 1000.0)
        }
    }
}

private func swipe(app: XCUIApplication, direction: SwipeDirection) {
let target = app.windows.firstMatch.exists ? app.windows.firstMatch : app
let start = target.coordinate(withNormalizedOffset: CGVector(dx: 0.5, dy: 0.2))
Expand Down Expand Up @@ -982,8 +1043,10 @@ private func resolveRunnerPort() -> UInt16 {

enum CommandType: String, Codable {
case tap
case tapSeries
case longPress
case drag
case dragSeries
case type
case swipe
case findText
Expand Down Expand Up @@ -1012,6 +1075,11 @@ struct Command: Codable {
let action: String?
let x: Double?
let y: Double?
let count: Double?
let intervalMs: Double?
let doubleTap: Bool?
let pauseMs: Double?
let pattern: String?
let x2: Double?
let y2: Double?
let durationMs: Double?
Expand Down
13 changes: 9 additions & 4 deletions skills/agent-device/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ For agent-driven exploration: use refs. For deterministic replay scripts: use se
```bash
agent-device open Settings --platform ios
agent-device snapshot -i
agent-device click @e3
agent-device press @e3
agent-device wait text "Camera"
agent-device alert wait 10000
agent-device fill @e5 "test"
Expand All @@ -29,7 +29,7 @@ npx -y agent-device

1. Open app or deep link: `open [app|url] [url]` (`open` handles target selection + boot/activation in the normal flow)
2. Snapshot: `snapshot` to get refs from accessibility tree
3. Interact using refs (`click @ref`, `fill @ref "text"`)
3. Interact using refs (`press @ref`, `fill @ref "text"`; `click` is an alias of `press`)
4. Re-snapshot after navigation/UI changes
5. Close session when done

Expand Down Expand Up @@ -109,13 +109,15 @@ agent-device appstate
### Interactions (use @refs from snapshot)

```bash
agent-device click @e1
agent-device press @e1 # Canonical tap command (`click` is an alias)
agent-device focus @e2
agent-device fill @e2 "text" # Clear then type (Android: verifies value and retries once on mismatch)
agent-device type "text" # Type into focused field without clearing
agent-device press 300 500 # Tap by coordinates
agent-device press 300 500 --count 12 --interval-ms 45
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
agent-device press @e1 --count 5 # Repeat taps on the same target
agent-device press @e1 --count 5 --double-tap # Use double-tap gesture per iteration
agent-device swipe 540 1500 540 500 120
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
agent-device long-press 300 500 800 # Long press (where supported)
Expand Down Expand Up @@ -178,7 +180,10 @@ agent-device apps --platform android --user-installed

## Best practices

- `press` supports gesture series controls: `--count`, `--interval-ms`, `--hold-ms`, `--jitter-px`.
- `press` is the canonical tap command; `click` is an alias with the same behavior.
- `press` (and `click`) accepts `x y`, `@ref`, and selector targets.
- `press`/`click` support gesture series controls: `--count`, `--interval-ms`, `--hold-ms`, `--jitter-px`, `--double-tap`.
- `--double-tap` cannot be combined with `--hold-ms` or `--jitter-px`.
- `swipe` supports coordinate + timing controls and repeat patterns: `swipe x1 y1 x2 y2 [durationMs] --count --pause-ms --pattern`.
- `swipe` timing is platform-safe: Android uses requested duration; iOS uses normalized safe timing to avoid long-press side effects.
- Pinch (`pinch <scale> [x y]`) is iOS simulator-only; scale > 1 zooms in, < 1 zooms out.
Expand Down
5 changes: 3 additions & 2 deletions skills/agent-device/references/snapshot-refs.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Purpose

Refs are useful for discovery/debugging. For deterministic scripts, use selectors.
For tap interactions, `press` is canonical; `click` is an equivalent alias.

## Snapshot

Expand All @@ -24,14 +25,14 @@ App: com.apple.Preferences
## Using refs (discovery/debug)

```bash
agent-device click @e2
agent-device press @e2
agent-device fill @e5 "test"
```

## Using selectors (deterministic)

```bash
agent-device click 'id="camera_row" || label="Camera" role=button'
agent-device press 'id="camera_row" || label="Camera" role=button'
agent-device fill 'id="search_input" editable=true' "test"
agent-device is visible 'id="camera_settings_anchor"'
```
Expand Down
4 changes: 2 additions & 2 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,12 @@ export async function runCli(argv: string[], deps: CliDeps = DEFAULT_CLI_DEPS):
if (logTailStopper) logTailStopper();
return;
}
if (command === 'click') {
if (command === 'click' || command === 'press') {
const ref = (response.data as any)?.ref ?? '';
const x = (response.data as any)?.x;
const y = (response.data as any)?.y;
if (ref && typeof x === 'number' && typeof y === 'number') {
process.stdout.write(`Clicked @${ref} (${x}, ${y})\n`);
process.stdout.write(`Tapped @${ref} (${x}, ${y})\n`);
}
if (logTailStopper) logTailStopper();
return;
Expand Down
43 changes: 43 additions & 0 deletions src/core/__tests__/dispatch-press.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { shouldUseIosDragSeries, shouldUseIosTapSeries } from '../dispatch.ts';
import type { DeviceInfo } from '../../utils/device.ts';

// Fixture: a booted iOS simulator, used as the positive case for both helpers.
const iosDevice: DeviceInfo = {
  booted: true,
  kind: 'simulator',
  name: 'iPhone 15',
  id: 'ios-1',
  platform: 'ios',
};

// Fixture: a booted Android emulator, used to confirm the helpers reject non-iOS devices.
const androidDevice: DeviceInfo = {
  booted: true,
  kind: 'emulator',
  name: 'Pixel',
  id: 'android-1',
  platform: 'android',
};

test('shouldUseIosTapSeries enables fast path for repeated plain iOS taps', () => {
  const useFastPath = shouldUseIosTapSeries(iosDevice, 5, 0, 0);
  assert.strictEqual(useFastPath, true);
});

test('shouldUseIosTapSeries disables fast path for single press or modified gestures', () => {
  // Each row is the argument triple passed after the device. The first entry is the
  // repeat count; the other two are presumably the hold and jitter modifiers
  // (confirm against dispatch.ts).
  const rejectedArgs = [
    [1, 0, 0],
    [5, 100, 0],
    [5, 0, 1],
  ] as const;
  for (const [first, second, third] of rejectedArgs) {
    assert.strictEqual(shouldUseIosTapSeries(iosDevice, first, second, third), false);
  }
});

test('shouldUseIosTapSeries disables fast path for non-iOS devices', () => {
  const useFastPath = shouldUseIosTapSeries(androidDevice, 5, 0, 0);
  assert.strictEqual(useFastPath, false);
});

test('shouldUseIosDragSeries enables fast path for repeated iOS swipes', () => {
  const useFastPath = shouldUseIosDragSeries(iosDevice, 3);
  assert.strictEqual(useFastPath, true);
});

test('shouldUseIosDragSeries disables fast path for single swipe and non-iOS', () => {
  const singleIosSwipe = shouldUseIosDragSeries(iosDevice, 1);
  const repeatedAndroidSwipe = shouldUseIosDragSeries(androidDevice, 3);
  assert.strictEqual(singleIosSwipe, false);
  assert.strictEqual(repeatedAndroidSwipe, false);
});
Loading