From 96c89c31dea73f6d1253c1f779dc7b7e0eaba403 Mon Sep 17 00:00:00 2001 From: saikrishna321 Date: Sun, 26 Apr 2026 18:42:21 +0530 Subject: [PATCH] feat: add zoom in and out support Co-authored-by: Srinivasan Sekar --- .kiro/steering/product.md | 28 ++++++++++++ .kiro/steering/structure.md | 66 ++++++++++++++++++++++++++++ .kiro/steering/tech.md | 86 +++++++++++++++++++++++++++++++++++++ CHANGELOG.md | 4 +- landing/index.html | 52 +++++++++++----------- landing/usage.html | 43 +++++++++++-------- src/agent/planner.ts | 6 +++ src/flow/llm-parser.ts | 11 +++++ src/flow/natural-line.ts | 25 +++++++++++ src/flow/parse-yaml-flow.ts | 21 +++++++++ src/flow/run-yaml-flow.ts | 64 +++++++++++++++++++++++++++ src/flow/types.ts | 7 +++ src/flow/vision-execute.ts | 27 +++++++++++- src/index.ts | 7 ++- src/playground/index.ts | 10 +++++ 15 files changed, 409 insertions(+), 48 deletions(-) create mode 100644 .kiro/steering/product.md create mode 100644 .kiro/steering/structure.md create mode 100644 .kiro/steering/tech.md diff --git a/.kiro/steering/product.md b/.kiro/steering/product.md new file mode 100644 index 0000000..0bd7d6f --- /dev/null +++ b/.kiro/steering/product.md @@ -0,0 +1,28 @@ +# AppClaw — Product Overview + +AppClaw is an agentic AI layer for mobile automation on Android and iOS. Users describe goals in plain English and AppClaw orchestrates device interactions through Appium (via MCP). It supports multiple LLM providers (Anthropic, OpenAI, Google Gemini, Groq, Ollama) via the Vercel AI SDK. + +## Core Modes + +- **Agent mode** — LLM-driven goal execution (e.g. 
`appclaw "Send a WhatsApp message to Mom"`) +- **YAML flows** — declarative, zero-LLM automation steps defined in YAML files +- **Playground** — interactive REPL for building flows live on a device +- **Explorer** — generates YAML test flows from a PRD or app description +- **Record/Replay** — capture and adaptively replay goal executions +- **Report** — Express server serving HTML run reports + +## Two Agent Modes + +- `dom` — uses XML page source and accessibility IDs/XPath to locate elements +- `vision` — screenshot-first using Stark (df-vision + Gemini) for element location + +## Perception → Reason → Act Loop + +Each step: read screen state → send to LLM → execute action (tap/type/swipe/etc.) → repeat until goal complete or max steps reached. + +## Published Artifacts + +- **npm package** (`appclaw`) — CLI + SDK +- **VS Code extension** — live multi-device grid view +- **GitHub Action** — CI integration +- **Landing page** — Cloudflare Workers static site diff --git a/.kiro/steering/structure.md b/.kiro/steering/structure.md new file mode 100644 index 0000000..1b6bcc3 --- /dev/null +++ b/.kiro/steering/structure.md @@ -0,0 +1,66 @@ +# Project Structure + +## Root Layout + +``` +appclaw/ +├── src/ # All TypeScript source (compiled → dist/) +├── dist/ # Compiled output (mirrors src/, gitignored) +├── tests/ # Test files (vitest) +├── flows/ # Example YAML flow files +├── examples/ # Example flows and PRDs +├── schemas/ # JSON schemas (flow.schema.json, env.schema.json) +├── skills/ # AI agent skill definitions (generate-appclaw-flow, use-appclaw-cli) +├── bin/ # CLI entry point (bin/appclaw.js) +├── docs/ # QA documentation +├── logs/ # Runtime execution logs (gitignored) +├── vscode-extension/ # VS Code extension (separate package.json + tsconfig) +├── github-action/ # GitHub Action definition +├── landing/ # Cloudflare Workers landing page +└── .appclaw/ # Runtime data: guides/, runs/ (recordings, screenshots) +``` + +## Source Modules (`src/`) + +| Module | 
Responsibility | +| ---------------- | --------------------------------------------------------------------------------- | +| `index.ts` | CLI entry — routes to all 6 modes based on flags | +| `config.ts` | Zod-validated config from `.env` | +| `constants.ts` | Default models, pricing, stuck detection thresholds | +| `agent/` | Core agent loop, stuck detection, recovery, planner, human-in-the-loop | +| `llm/` | Multi-provider LLM integration — provider factory, prompt builder, action schemas | +| `mcp/` | Appium MCP client — tool calling, element finding, screenshots, keyboard | +| `perception/` | Screen parsing — Android/iOS XML parsers, DOM trimmer, screen diff | +| `vision/` | AI vision element location via Stark (df-vision + Gemini) | +| `flow/` | YAML flow parsing and execution, natural language step handling, parallel runner | +| `device/` | Device setup pipeline — platform/device picker, iOS setup, Appium session | +| `memory/` | Episodic memory — trajectory recording, fingerprinting, retrieval | +| `explorer/` | PRD → YAML flow generation, screen crawler | +| `playground/` | Interactive REPL for building flows | +| `recording/` | Session recorder and adaptive replayer | +| `report/` | Run artifact collection, HTML report rendering, Express server | +| `sdk/` | Public SDK — `GoalRunner`, `FlowRunner`, `StepRunner`, config builder | +| `skills/` | Built-in skill implementations (find-and-tap, read-screen, submit-message) | +| `ui/terminal.ts` | Rich terminal output — spinners, boxes, markdown rendering | +| `appguides/` | App-specific interaction guides | + +## Tests (`tests/`) + +``` +tests/ +├── flow/ # Flow parsing and execution unit tests +├── sdk/ # SDK integration tests +├── e2e/ # End-to-end device tests (require connected device) +├── vision/ # Vision module tests +└── flows/ # YAML flow fixtures used by tests +``` + +## Key Conventions + +- Each `src/` subdirectory typically has an `index.ts` as its public interface +- Types are co-located in 
`types.ts` within each module +- No barrel re-exports at the root `src/` level — import from specific modules +- The SDK (`src/sdk/`) is the only public API surface; everything else is internal +- YAML flows live in `flows/` (project-level) or `examples/flows/` (examples) +- `.appclaw/runs/` stores per-run artifacts: `manifest.json`, `recording.mp4`, step screenshots +- `.appclaw/guides/` stores per-app interaction guides keyed by bundle/package ID diff --git a/.kiro/steering/tech.md b/.kiro/steering/tech.md new file mode 100644 index 0000000..04bb062 --- /dev/null +++ b/.kiro/steering/tech.md @@ -0,0 +1,86 @@ +# Tech Stack + +## Language & Runtime + +- **TypeScript** (strict mode, ES2022 modules) +- **Node.js** 18+ runtime +- **tsx** for dev/local execution without compiling + +## Build System + +- **TypeScript compiler** (`tsc`) — outputs to `dist/`, mirrors `src/` structure +- **No bundler** for the main package — pure tsc compilation +- **Vite** available in node_modules (used by VS Code extension) + +## Key Libraries + +- **Vercel AI SDK** (`ai`, `@ai-sdk/*`) — multi-provider LLM abstraction +- **appium-mcp** — Appium Model Context Protocol server (stdio or SSE transport) +- **@modelcontextprotocol/sdk** — MCP client +- **Zod** — schema validation (config, LLM responses, flow schemas) +- **yaml** — YAML flow file parsing +- **sharp** — image processing for screenshots +- **df-vision** — Stark vision element location (Gemini-backed) +- **dotenv** — `.env` config loading +- **express** — report server +- **hono** — MCP server HTTP layer +- **vitest** — test runner +- **prettier** — code formatting + +## LLM Providers + +Supported via Vercel AI SDK: `anthropic`, `openai`, `gemini`, `groq`, `ollama` + +## Code Style + +- Prettier config: single quotes, semi, 100 char print width, 2-space indent, trailing commas (ES5) +- No DI framework — modules import each other directly +- Zod for all external data validation +- Constants and model pricing centralized in 
`src/constants.ts` + +## Common Commands + +```bash +# Development +npm start # run via tsx (no compile) +npm start "goal" # run with a goal +npm run dev # run with file watching + +# Build & Type Check +npm run build # tsc → dist/ +npm run typecheck # type-check only, no emit +npm run lint # alias for typecheck + +# Formatting +npm run format # prettier --write +npm run format:check # prettier --check + +# Tests +npm test # vitest run tests/flow tests/sdk +npm run test:e2e # vitest run tests/e2e/ +npm run test:e2e:android # android e2e with MCP_DEBUG=1 +npm run test:watch # vitest watch mode + +# VS Code Extension +npm run build:vsix # build .vsix package + +# Landing page +npm run deploy:landing # deploy to Cloudflare Workers +``` + +## Configuration + +All runtime config via `.env`, validated by Zod schema in `src/config.ts`. Key variables: + +| Variable | Default | Description | +| ---------------- | -------- | ------------------------------------------------- | +| `LLM_PROVIDER` | `gemini` | `anthropic`, `openai`, `gemini`, `groq`, `ollama` | +| `LLM_API_KEY` | — | API key for chosen provider | +| `AGENT_MODE` | `dom` | `dom` or `vision` | +| `PLATFORM` | (prompt) | `android` or `ios` | +| `MAX_STEPS` | `30` | Max steps per goal | +| `CLOUD_PROVIDER` | — | `lambdatest` for remote devices | + +## Release + +Automated via **semantic-release** with conventional commits. Config in `.releaserc.json`. 
diff --git a/CHANGELOG.md b/CHANGELOG.md index e7f4120..6f6dc17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,13 @@ ### Bug Fixes -* update docs ([d55a5e3](https://github.com/AppiumTestDistribution/AppClaw/commit/d55a5e3117b9628a065fe48c5392ed1be739424d)) +- update docs ([d55a5e3](https://github.com/AppiumTestDistribution/AppClaw/commit/d55a5e3117b9628a065fe48c5392ed1be739424d)) ## [1.2.0](https://github.com/AppiumTestDistribution/AppClaw/compare/v1.1.0...v1.2.0) (2026-04-24) ### Features -* Appguide support ([#22](https://github.com/AppiumTestDistribution/AppClaw/issues/22)) ([63e366f](https://github.com/AppiumTestDistribution/AppClaw/commit/63e366feeb1f36c22643fff8d015f5b3b1253f6c)) +- Appguide support ([#22](https://github.com/AppiumTestDistribution/AppClaw/issues/22)) ([63e366f](https://github.com/AppiumTestDistribution/AppClaw/commit/63e366feeb1f36c22643fff8d015f5b3b1253f6c)) ## [1.1.0](https://github.com/AppiumTestDistribution/AppClaw/compare/v1.0.0...v1.1.0) (2026-04-17) diff --git a/landing/index.html b/landing/index.html index 56ec340..f951677 100644 --- a/landing/index.html +++ b/landing/index.html @@ -2056,28 +2056,21 @@

The agent knows
your app before
it opens it.

to reach any setting. No trial-and-error exploration.

-
- ✉️Gmail -
-
- ▶️YouTube -
-
- 💬WhatsApp -
-
- 🌐Chrome -
-
- ⚙️Settings -
+
✉️Gmail
+
▶️YouTube
+
💬WhatsApp
+
🌐Chrome
+
⚙️Settings
+ your app
📄 - Drop .appclaw/guides/<appId>.md to add or override any guide — custom guides always take priority over built-ins. + Drop .appclaw/guides/<appId>.md to add or override any guide — + custom guides always take priority over built-ins.
@@ -2091,19 +2084,28 @@

The agent knows
your app before
it opens it.

## WhatsApp Navigation
- - Bottom tabs: Chats | Updates | Communities | Calls
- - New chat: floating pencil icon (bottom-right)
- - Search: magnifying-glass at top of Chats
+ - + Bottom tabs: Chats | Updates | Communities | Calls
+ - + New chat: floating pencil icon (bottom-right)
+ - + Search: magnifying-glass at top of Chats

## Messaging
- - Open a chat → type in message bar → send via arrow
- - Attach media: paperclip icon next to message bar
- - Voice note: long-press the microphone icon
+ - + Open a chat → type in message bar → send via arrow
+ - + Attach media: paperclip icon next to message bar
+ - + Voice note: long-press the microphone icon

## Common Actions
- - Star a message: long-press → star icon
- - Forward: long-press → forward arrow
- - Group info: tap the group name at the top + - + Star a message: long-press → star icon
+ - + Forward: long-press → forward arrow
+ - + Group info: tap the group name at the top
diff --git a/landing/usage.html b/landing/usage.html index 43ca237..40ba4fb 100644 --- a/landing/usage.html +++ b/landing/usage.html @@ -3453,10 +3453,10 @@

Runner Requirements

App Guides / Overview

What are App Guides?

- App Guides (AppGuides) are per-app knowledge snippets injected directly into the - agent's context window at the start of every automation run. They encode navigation - patterns, gesture shortcuts, and common action paths for a specific app — so the agent - never needs to rediscover them by trial and error. + App Guides (AppGuides) are per-app knowledge snippets injected directly into the agent's + context window at the start of every automation run. They encode navigation patterns, + gesture shortcuts, and common action paths for a specific app — so the agent never needs + to rediscover them by trial and error.

@@ -3483,7 +3483,8 @@

How it works

System prompt injection (simplified)
-
APP_GUIDE (WhatsApp):
+            
+APP_GUIDE (WhatsApp):
 
 ## WhatsApp Navigation
 - Bottom tabs: Chats | Updates | Communities | Calls
@@ -3493,14 +3494,15 @@ 

How it works

## Messaging - Open a chat → type in the message bar at the bottom → send via arrow icon - Attach media: paperclip icon next to message bar -- Voice note: long-press the microphone icon
+- Voice note: long-press the microphone icon

Resolution order

  1. - Custom guide — - .appclaw/guides/<appId>.md (highest priority, overrides built-ins) + Custom guide.appclaw/guides/<appId>.md (highest + priority, overrides built-ins)
  2. Built-in guide — bundled guides for 10 common apps
  3. @@ -3515,7 +3517,8 @@

    Resolution order

    Built-in Guides

    AppClaw ships with guides for the most commonly automated apps on both Android and iOS. - These activate automatically when AppClaw detects the matching package name or bundle ID. + These activate automatically when AppClaw detects the matching package name or bundle + ID.

    @@ -3586,7 +3589,8 @@

    Example: WhatsApp Guide

    APP_GUIDE (WhatsApp) -
    ## WhatsApp Navigation
    +            
    +## WhatsApp Navigation
     - Bottom tabs: Chats | Updates | Communities | Calls
     - New chat: floating pencil/message icon (bottom-right)
     - Search: magnifying-glass icon at the top of Chats
    @@ -3601,7 +3605,8 @@ 

    Example: WhatsApp Guide

    - Star a message: long-press message → star icon - Forward: long-press message → forward arrow - Delete: long-press message → trash icon -- Group info: tap the group name at the top of the chat
    +- Group info: tap the group name at the top of the chat

    Example: YouTube Guide

    @@ -3610,7 +3615,8 @@

    Example: YouTube Guide

    APP_GUIDE (YouTube) -
    ## YouTube Navigation
    +            
    +## YouTube Navigation
     - Bottom nav: Home | Shorts | + (upload) | Subscriptions | Library
     - Search: magnifying-glass icon (top-right)
     - Tap a video thumbnail to play; double-tap left/right to seek ±10 s
    @@ -3622,7 +3628,8 @@ 

    Example: YouTube Guide

    ## Playback - Full screen: rotate device or tap the expand icon (bottom-right of player) - Quality: tap ⋮ inside player → Quality -- Captions: tap CC icon inside player
    +- Captions: tap CC icon inside player
    @@ -3638,8 +3645,8 @@

    Custom Guides

    Custom guides always win

    - If a custom guide exists for an app ID, it replaces the built-in entirely. To extend - a built-in guide, copy its contents into your custom file and add your own sections. + If a custom guide exists for an app ID, it replaces the built-in entirely. To extend a + built-in guide, copy its contents into your custom file and add your own sections.

    @@ -3658,7 +3665,8 @@

    Creating a custom guide

    .appclaw/guides/com.myapp.android.md -
    ## Main Navigation
    +            
    +## Main Navigation
     - Bottom tabs: Home | Search | Orders | Profile
     - Hamburger menu (top-left) → categories and account settings
     
    @@ -3669,7 +3677,8 @@ 

    Creating a custom guide

    ## Product Search - Tap the search bar at the top; supports filters: Brand | Price | Rating -- Long-press any product thumbnail to preview without navigating away
    +- Long-press any product thumbnail to preview without navigating away

    diff --git a/src/agent/planner.ts b/src/agent/planner.ts index 3c4d697..5536c9e 100644 --- a/src/agent/planner.ts +++ b/src/agent/planner.ts @@ -36,6 +36,8 @@ export interface SubGoal { dependsOn?: number; status: 'pending' | 'in_progress' | 'completed' | 'failed'; result?: string; + /** The actual goal text that was executed (may differ from `goal` if the orchestrator rewrote it) */ + executedAs?: string; } export interface PlannerResult { @@ -399,8 +401,11 @@ Your job is to decide ONE of three actions: - Sub-goal is "Open Settings" but Settings is already open (DOM shows Settings screen elements) - Sub-goal is "Navigate to WiFi settings" but WiFi settings are already visible in DOM - Sub-goal is "Enter email address" but the address is already present in the field in the DOM +- Sub-goal is "Search/navigate/filter to reach X" but X is ALREADY VISIBLE on screen — the method of getting there is irrelevant, skip it Only skip when there is CONCRETE evidence in the DOM/screenshot — never skip based on assumptions. +KEY INSIGHT: Always evaluate whether the DESIRED OUTCOME of a sub-goal is already present on screen, not just whether the specific action was taken. The method (search, navigate, filter, scroll) is irrelevant if the end result is already achieved. + **rewrite** — The sub-goal needs adaptation because the screen state is different than expected. For example: - Sub-goal is "Navigate to X" but X is already visible — rewrite to the actual action needed - An overlay/dropdown/dialog is blocking the intended action — read the DOM, find the specific element to dismiss it, and include that in the rewritten goal @@ -414,6 +419,7 @@ Rules: - Read the DOM carefully to understand what screen the device is currently showing - Check for blockers: Is a keyboard visible? Is an autocomplete dropdown open? Is a dialog showing? 
- Be aggressive about skipping — if the screen already shows the desired state, skip +- Evaluate OUTCOMES, not actions: if the desired result of a step is already on screen, skip — regardless of whether the specific action (search, navigate, filter, scroll) was taken - When rewriting, make the new goal specific to what ACTUALLY needs to happen from the current screen - CRITICAL: When rewriting, READ THE DOM and reference the SPECIFIC element to interact with by its text/desc from the DOM. Do NOT give vague instructions. - NEVER rewrite a sub-goal to include work from ALREADY COMPLETED sub-goals. Only cover what THIS sub-goal needs to do. diff --git a/src/flow/llm-parser.ts b/src/flow/llm-parser.ts index 2d1ac82..3ed7236 100644 --- a/src/flow/llm-parser.ts +++ b/src/flow/llm-parser.ts @@ -56,6 +56,15 @@ const stepSchema = z.discriminatedUnion('kind', [ z.object({ kind: z.literal('getInfo'), query: z.string() }), z.object({ kind: z.literal('done'), message: z.string().optional() }), z.object({ kind: z.literal('launchApp') }), + z.object({ + kind: z.literal('zoom'), + scale: z + .number() + .describe( + 'Scale factor: > 1 = zoom in, < 1 = zoom out. e.g. 2.0 = 2x zoom in, 0.5 = zoom out' + ), + target: z.string().optional().describe('Optional element label to zoom on'), + }), ]); const SYSTEM_PROMPT = @@ -71,6 +80,8 @@ const SYSTEM_PROMPT = `- "wait seconds" → wait\n` + `- "drag/slide/move X to Y" → drag (from=X, to=Y)\n` + `- "swipe/scroll " → swipe\n` + + `- "zoom in [Nx] [on/into/the ]" → zoom (scale > 1), "zoom out [on ]" → zoom (scale < 1). e.g. 
"zoom in the map", "zoom in 2x on the image"\n` + + `- "pinch in/out [on/into/the ]" → zoom\n` + `- "verify/check/assert " → assert\n` + `- "scroll until visible" → scrollAssert\n` + `- "go back" → back, "go home" → home\n` + diff --git a/src/flow/natural-line.ts b/src/flow/natural-line.ts index 6facdda..5344312 100644 --- a/src/flow/natural-line.ts +++ b/src/flow/natural-line.ts @@ -157,6 +157,31 @@ export function tryParseNaturalFlowLine(line: string): FlowStep | null { return { kind: 'swipe', direction, ...(repeat && repeat > 1 ? { repeat } : {}), verbatim }; } + // "zoom in [on X]" / "zoom out [on X]" / "pinch in [on X]" / "pinch out [on X]" + // "zoom in 2x [on X]" / "zoom out 50% [on X]" / "zoom in the map" (no "on/into") + const zoomMatch = t.match( + /^(?:zoom|pinch)\s+(in|out)(?:\s+(\d+(?:\.\d+)?)\s*(?:x|times?|%)?)?(?:\s+(?:(?:on|into)\s+)?(?:the\s+)?(.+))?$/i + ); + if (zoomMatch) { + const direction = zoomMatch[1].toLowerCase(); + const rawFactor = zoomMatch[2] ? Number(zoomMatch[2]) : undefined; + const target = zoomMatch[3] ? trimPunct(zoomMatch[3].trim()) : undefined; + // Determine scale: zoom in > 1, zoom out < 1 + let scale: number; + if (rawFactor !== undefined) { + const isPercent = zoomMatch[0].match(/\d+\s*%/); + if (isPercent) { + // "zoom out 50%" → scale = 0.5, "zoom in 200%" → scale = 2.0 + scale = direction === 'out' ? rawFactor / 100 : rawFactor / 100; + } else { + // "zoom in 2x" → scale = 2.0, "zoom out 2x" → scale = 0.5 + scale = direction === 'out' ? 1 / rawFactor : rawFactor; + } + } else { + scale = direction === 'out' ? 0.5 : 2.0; + } + return { kind: 'zoom', scale, ...(target ? { target } : {}), verbatim }; + } // ── waitUntil: "wait until screen is loaded", "wait until is visible/gone" ── // Also: "wait 5s until ..." / "wait 10 seconds until ..." 
diff --git a/src/flow/parse-yaml-flow.ts b/src/flow/parse-yaml-flow.ts index 9b202af..0f4dfac 100644 --- a/src/flow/parse-yaml-flow.ts +++ b/src/flow/parse-yaml-flow.ts @@ -106,6 +106,18 @@ export function normalizeStructured(raw: unknown, index: number): FlowStep | nul }; } + // ── Multi-key: zoom { scale, target? } ── + if (keys.includes('zoom') || (keys.includes('scale') && !keys.includes('from'))) { + const scaleVal = o.zoom !== undefined ? Number(o.zoom) : Number(o.scale); + if (!Number.isFinite(scaleVal) || scaleVal <= 0) { + throw new Error( + `Step ${index + 1}: zoom scale must be a positive number (e.g. 2.0 = zoom in 2x, 0.5 = zoom out)` + ); + } + const target = o.target != null ? String(o.target).trim() : undefined; + return { kind: 'zoom', scale: scaleVal, ...(target ? { target } : {}) }; + } + // ── Multi-key: drag { from, to } ── if (keys.includes('from') && keys.includes('to')) { const duration = o.duration != null ? Number(o.duration) : undefined; @@ -194,6 +206,15 @@ export function normalizeStructured(raw: unknown, index: number): FlowStep | nul ); } if (k === 'tap') return { kind: 'tap', label: String(v) }; + if (k === 'zoom') { + const scale = Number(v); + if (!Number.isFinite(scale) || scale <= 0) { + throw new Error( + `Step ${index + 1}: zoom scale must be a positive number (e.g. 2.0 = zoom in 2x, 0.5 = zoom out)` + ); + } + return { kind: 'zoom', scale }; + } if (k === 'type') return { kind: 'type', text: String(v) }; if (k === 'assert' || k === 'verify' || k === 'check') { return { kind: 'assert', text: String(v) }; diff --git a/src/flow/run-yaml-flow.ts b/src/flow/run-yaml-flow.ts index 9b93fac..571b0cd 100644 --- a/src/flow/run-yaml-flow.ts +++ b/src/flow/run-yaml-flow.ts @@ -148,6 +148,8 @@ function stepLabel(step: FlowStep): string { return 'goHome'; case 'swipe': return `swipe ${step.direction}`; + case 'zoom': + return `zoom ${step.scale >= 1 ? 'in' : 'out'} (${step.scale}x)${step.target ? 
` on "${step.target}"` : ''}`; case 'drag': return `drag "${step.from}" to "${step.to}"`; case 'assert': @@ -1014,6 +1016,68 @@ export async function executeStep( message: count > 1 ? `Swiped ${dir} ${count} times` : `Swiped ${dir}`, }; } + case 'zoom': { + // Resolve optional target to coordinates/UUID for pinch_zoom. + // Vision mode: use df-vision to locate the element by description. + // DOM mode: parse page source and find element UUID. + let elementUUID: string | undefined; + if (step.target) { + try { + if (isVisionMode() || isVisionLocateEnabled()) { + // Vision path: locate element → get synthetic ai-element UUID with coordinates + const visionUuid = await findElementByVision(mcp, step.target); + if (visionUuid) elementUUID = visionUuid; + } else { + // DOM path: parse page source → find element UUID + const { findByIdStrategies: zoomFindById } = await import('../agent/element-finder.js'); + const pageSource = await getPageSource(mcp); + const { detectPlatform: zoomDetectPlatform } = await import('../perception/screen.js'); + const { parseAndroidPageSource: zoomParseAndroid } = + await import('../perception/android-parser.js'); + const { parseIOSPageSource: zoomParseIOS } = + await import('../perception/ios-parser.js'); + const platform = zoomDetectPlatform(pageSource); + const elements = + platform === 'android' ? zoomParseAndroid(pageSource) : zoomParseIOS(pageSource); + const scored = elements + .map((el) => ({ el, s: scoreTapMatch(el, step.target!) })) + .filter((x) => x.s >= 0) + .sort((a, b) => a.s - b.s); + const pick = scored[0]?.el; + if (pick) { + elementUUID = + (await zoomFindById(mcp, pick.accessibilityId || pick.id, pick.text)) ?? 
undefined;
+          }
+        }
+      } catch {
+        // Non-fatal: fall back to screen-center zoom
+      }
+    }
+
+    let pinchArgs: Record<string, unknown> = { action: 'pinch_zoom', scale: step.scale };
+    if (elementUUID && !isAIElement(elementUUID)) {
+      // Only pass real Appium element UUIDs — ai-element: synthetic UUIDs are not
+      // in Appium's element cache and will cause a 404 in the pinch handler.
+      pinchArgs.elementUUID = elementUUID;
+    }
+
+    const zoomResult = await mcp.callTool('appium_gesture', pinchArgs);
+    const zoomText =
+      zoomResult.content
+        ?.map((c: { type: string; text?: string }) => (c.type === 'text' ? c.text : ''))
+        .join('') ?? '';
+    const zoomFailed =
+      zoomText.toLowerCase().includes('failed') || zoomText.toLowerCase().includes('error');
+    if (zoomFailed) {
+      return { success: false, message: zoomText.slice(0, 200) };
+    }
+    const direction = step.scale >= 1 ? 'in' : 'out';
+    const targetDesc = step.target ? ` on "${step.target}"` : '';
+    return {
+      success: true,
+      message: `Zoomed ${direction} (scale=${step.scale})${targetDesc}`,
+    };
+  }
   case 'drag': {
     const dragApiKey = getStarkVisionApiKey();
     const dragBaseUrl = getStarkVisionBaseUrl();
diff --git a/src/flow/types.ts b/src/flow/types.ts
index efeceae..c776233 100644
--- a/src/flow/types.ts
+++ b/src/flow/types.ts
@@ -71,6 +71,13 @@ export type FlowStep =
   | ({ kind: 'back' } & Verbatim)
   | ({ kind: 'home' } & Verbatim)
   | ({ kind: 'swipe'; direction: 'up' | 'down' | 'left' | 'right'; repeat?: number } & Verbatim)
+  | ({
+      kind: 'zoom';
+      /** > 1 = zoom in (pinch open), < 1 = zoom out (pinch close). e.g. 2.0 = 2x zoom in, 0.5 = zoom out */
+      scale: number;
+      /** Optional label of the element to zoom on. If omitted, zooms on the center of the screen. 
*/ + target?: string; + } & Verbatim) | ({ kind: 'drag'; from: string; diff --git a/src/flow/vision-execute.ts b/src/flow/vision-execute.ts index 3d75288..258e423 100644 --- a/src/flow/vision-execute.ts +++ b/src/flow/vision-execute.ts @@ -403,6 +403,25 @@ function preCheck(instruction: string): PreCheckResult | null { }; } + // 5c. zoom / pinch — "zoom in [Nx] [on/into] [the] X", "zoom out [on X]", "pinch in/out [on X]" + // "on/into" is optional so "zoom in the map" works as well as "zoom in on the map" + const zoomMatch = t.match( + /^(?:zoom|pinch)\s+(in|out)(?:\s+(\d+(?:\.\d+)?)\s*(?:x|times?|%)?)?(?:\s+(?:(?:on|into)\s+)?(?:the\s+)?(.+))?$/i + ); + if (zoomMatch) { + const direction = zoomMatch[1].toLowerCase(); + const rawFactor = zoomMatch[2] ? Number(zoomMatch[2]) : undefined; + const target = zoomMatch[3] ? zoomMatch[3].replace(/[.!?]+$/g, '').trim() : undefined; + let scale: number; + if (rawFactor !== undefined) { + const isPercent = zoomMatch[0].match(/\d+\s*%/); + scale = isPercent ? rawFactor / 100 : direction === 'out' ? 1 / rawFactor : rawFactor; + } else { + scale = direction === 'out' ? 0.5 : 2.0; + } + return { step: { kind: 'zoom', scale, ...(target ? { target } : {}), verbatim: t } }; + } + // 6. Visibility assert — any instruction starting with an assert/verify verb, // or "is X visible?" pattern. Pass the full instruction to the vision model // as-is — let the LLM interpret what to check instead of brittle regex parsing. 
@@ -472,8 +491,12 @@ export async function visionExecute( // ── Pre-check: non-visual instructions ── const pre = preCheck(instruction); if (pre?.step) { - // scrollAssert and waitUntil need executeStep for their polling/scroll logic - if (pre.step.kind === 'scrollAssert' || pre.step.kind === 'waitUntil') { + // These step kinds need executeStep for their device gesture / polling logic + if ( + pre.step.kind === 'scrollAssert' || + pre.step.kind === 'waitUntil' || + pre.step.kind === 'zoom' + ) { return { step: pre.step, result: { success: false, message: '__needs_executeStep__' } }; } // Other pre-check steps — let caller fall through to classifyInstruction → executeStep diff --git a/src/index.ts b/src/index.ts index dd62e17..f6326d6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -958,13 +958,13 @@ async function main() { const prevGoal = executor.all[subGoalIdx - 1]; const completedGoalsList = executor.all .filter((sg) => sg.status === 'completed') - .map((sg) => `${sg.goal} → ${sg.result}`); + .map((sg) => `${sg.executedAs ?? sg.goal} → ${sg.result}`); const [readiness, decision] = await Promise.all([ prevGoal ? assessScreenReadiness( plannerModel, - prevGoal.goal, + prevGoal.executedAs ?? 
prevGoal.goal, subGoal.goal, orchestratorDom, thinkingOptions, @@ -1052,6 +1052,9 @@ async function main() { } } + // Track the actual goal being executed so reconciliation uses the rewritten goal, not the original + subGoal.executedAs = effectiveGoal; + emitJson({ event: 'goal_start', data: { goal: effectiveGoal, subGoalIndex: subGoalIdx, totalSubGoals: executor.all.length }, diff --git a/src/playground/index.ts b/src/playground/index.ts index 8721496..fe6c348 100644 --- a/src/playground/index.ts +++ b/src/playground/index.ts @@ -90,6 +90,8 @@ function stepAction(step: FlowStep): string { return 'type'; case 'swipe': return 'swipe'; + case 'zoom': + return 'zoom'; case 'wait': return 'wait'; case 'waitUntil': @@ -128,6 +130,8 @@ function stepTarget(step: FlowStep): string { return `"${step.text}"${step.target ? ` → ${step.target}` : ''}`; case 'swipe': return step.direction; + case 'zoom': + return `${step.scale >= 1 ? 'in' : 'out'} (${step.scale}x)${step.target ? ` on "${step.target}"` : ''}`; case 'wait': return `${step.seconds}s`; case 'waitUntil': @@ -170,6 +174,8 @@ function spinnerDetail(step: FlowStep): string { return 'typing into the field…'; case 'swipe': return 'swiping the screen…'; + case 'zoom': + return `zooming ${step.scale >= 1 ? 'in' : 'out'}…`; case 'scrollAssert': return 'scanning the screen…'; case 'assert': @@ -260,6 +266,10 @@ function stepToYaml(step: FlowStep): unknown { return `type "${step.text}"`; case 'swipe': return `swipe ${step.direction}`; + case 'zoom': + return step.target + ? `zoom ${step.scale >= 1 ? 'in' : 'out'} ${step.scale}x on ${step.target}` + : `zoom ${step.scale >= 1 ? 'in' : 'out'} ${step.scale}x`; case 'wait': return `wait ${step.seconds} s`; case 'waitUntil':