AppClaw ships with guides for the most commonly automated apps on both Android and iOS.
- These activate automatically when AppClaw detects the matching package name or bundle ID.
+ These activate automatically when AppClaw detects the matching package name or bundle
+ ID.
@@ -3586,7 +3589,8 @@ Example: WhatsApp Guide
APP_GUIDE (WhatsApp)
- ## WhatsApp Navigation
+
+## WhatsApp Navigation
- Bottom tabs: Chats | Updates | Communities | Calls
- New chat: floating pencil/message icon (bottom-right)
- Search: magnifying-glass icon at the top of Chats
@@ -3601,7 +3605,8 @@ Example: WhatsApp Guide
- Star a message: long-press message → star icon
- Forward: long-press message → forward arrow
- Delete: long-press message → trash icon
-- Group info: tap the group name at the top of the chat
+- Group info: tap the group name at the top of the chat
Example: YouTube Guide
@@ -3610,7 +3615,8 @@ Example: YouTube Guide
APP_GUIDE (YouTube)
- ## YouTube Navigation
+
+## YouTube Navigation
- Bottom nav: Home | Shorts | + (upload) | Subscriptions | Library
- Search: magnifying-glass icon (top-right)
- Tap a video thumbnail to play; double-tap left/right to seek ±10 s
@@ -3622,7 +3628,8 @@ Example: YouTube Guide
## Playback
- Full screen: rotate device or tap the expand icon (bottom-right of player)
- Quality: tap ⋮ inside player → Quality
-- Captions: tap CC icon inside player
+- Captions: tap CC icon inside player
@@ -3638,8 +3645,8 @@ Custom Guides
Custom guides always win
- If a custom guide exists for an app ID, it replaces the built-in entirely. To extend
- a built-in guide, copy its contents into your custom file and add your own sections.
+ If a custom guide exists for an app ID, it replaces the built-in entirely. To extend a
+ built-in guide, copy its contents into your custom file and add your own sections.
@@ -3658,7 +3665,8 @@ Creating a custom guide
.appclaw/guides/com.myapp.android.md
- ## Main Navigation
+
+## Main Navigation
- Bottom tabs: Home | Search | Orders | Profile
- Hamburger menu (top-left) → categories and account settings
@@ -3669,7 +3677,8 @@ Creating a custom guide
## Product Search
- Tap the search bar at the top; supports filters: Brand | Price | Rating
-- Long-press any product thumbnail to preview without navigating away
+- Long-press any product thumbnail to preview without navigating away
diff --git a/src/agent/planner.ts b/src/agent/planner.ts
index 3c4d697..5536c9e 100644
--- a/src/agent/planner.ts
+++ b/src/agent/planner.ts
@@ -36,6 +36,8 @@ export interface SubGoal {
dependsOn?: number;
status: 'pending' | 'in_progress' | 'completed' | 'failed';
result?: string;
+ /** The actual goal text that was executed (may differ from `goal` if the orchestrator rewrote it) */
+ executedAs?: string;
}
export interface PlannerResult {
@@ -399,8 +401,11 @@ Your job is to decide ONE of three actions:
- Sub-goal is "Open Settings" but Settings is already open (DOM shows Settings screen elements)
- Sub-goal is "Navigate to WiFi settings" but WiFi settings are already visible in DOM
- Sub-goal is "Enter email address" but the address is already present in the field in the DOM
+- Sub-goal is "Search/navigate/filter to reach X" but X is ALREADY VISIBLE on screen — the method of getting there is irrelevant, skip it
Only skip when there is CONCRETE evidence in the DOM/screenshot — never skip based on assumptions.
+KEY INSIGHT: Always evaluate whether the DESIRED OUTCOME of a sub-goal is already present on screen, not just whether the specific action was taken. The method (search, navigate, filter, scroll) is irrelevant if the end result is already achieved.
+
**rewrite** — The sub-goal needs adaptation because the screen state is different than expected. For example:
- Sub-goal is "Navigate to X" but X is already visible — rewrite to the actual action needed
- An overlay/dropdown/dialog is blocking the intended action — read the DOM, find the specific element to dismiss it, and include that in the rewritten goal
@@ -414,6 +419,7 @@ Rules:
- Read the DOM carefully to understand what screen the device is currently showing
- Check for blockers: Is a keyboard visible? Is an autocomplete dropdown open? Is a dialog showing?
- Be aggressive about skipping — if the screen already shows the desired state, skip
+- Evaluate OUTCOMES, not actions: if the desired result of a step is already on screen, skip — regardless of whether the specific action (search, navigate, filter, scroll) was taken
- When rewriting, make the new goal specific to what ACTUALLY needs to happen from the current screen
- CRITICAL: When rewriting, READ THE DOM and reference the SPECIFIC element to interact with by its text/desc from the DOM. Do NOT give vague instructions.
- NEVER rewrite a sub-goal to include work from ALREADY COMPLETED sub-goals. Only cover what THIS sub-goal needs to do.
diff --git a/src/flow/llm-parser.ts b/src/flow/llm-parser.ts
index 2d1ac82..3ed7236 100644
--- a/src/flow/llm-parser.ts
+++ b/src/flow/llm-parser.ts
@@ -56,6 +56,15 @@ const stepSchema = z.discriminatedUnion('kind', [
z.object({ kind: z.literal('getInfo'), query: z.string() }),
z.object({ kind: z.literal('done'), message: z.string().optional() }),
z.object({ kind: z.literal('launchApp') }),
+ z.object({
+ kind: z.literal('zoom'),
+ scale: z
+ .number()
+ .describe(
+ 'Scale factor: > 1 = zoom in, < 1 = zoom out. e.g. 2.0 = 2x zoom in, 0.5 = zoom out'
+ ),
+ target: z.string().optional().describe('Optional element label to zoom on'),
+ }),
]);
const SYSTEM_PROMPT =
@@ -71,6 +80,8 @@ const SYSTEM_PROMPT =
`- "wait seconds" → wait\n` +
`- "drag/slide/move X to Y" → drag (from=X, to=Y)\n` +
`- "swipe/scroll " → swipe\n` +
+ `- "zoom in [Nx] [on/into/the <target>]" → zoom (scale > 1), "zoom out [on <target>]" → zoom (scale < 1). e.g. "zoom in the map", "zoom in 2x on the image"\n` +
+ `- "pinch in/out [on/into/the <target>]" → zoom\n` +
`- "verify/check/assert " → assert\n` +
`- "scroll until visible" → scrollAssert\n` +
`- "go back" → back, "go home" → home\n` +
diff --git a/src/flow/natural-line.ts b/src/flow/natural-line.ts
index 6facdda..5344312 100644
--- a/src/flow/natural-line.ts
+++ b/src/flow/natural-line.ts
@@ -157,6 +157,31 @@ export function tryParseNaturalFlowLine(line: string): FlowStep | null {
return { kind: 'swipe', direction, ...(repeat && repeat > 1 ? { repeat } : {}), verbatim };
}
+  // "zoom in [on X]" / "zoom out [on X]" / "pinch in [on X]" / "pinch out [on X]"
+  // "zoom in 2x [on X]" / "zoom out 50% [on X]" / "zoom in the map" (no "on/into")
+  // Capture groups: 1 = direction, 2 = numeric factor, 3 = unit (x | time(s) | %), 4 = target.
+  // NOTE(review): "pinch in" is mapped to zoom IN here, although the physical pinch-in
+  // gesture normally zooms out — confirm this mapping is intended.
+  const zoomMatch = t.match(
+    /^(?:zoom|pinch)\s+(in|out)(?:\s+(\d+(?:\.\d+)?)\s*(x|times?|%)?)?(?:\s+(?:(?:on|into)\s+)?(?:the\s+)?(.+))?$/i
+  );
+  if (zoomMatch) {
+    const direction = zoomMatch[1].toLowerCase();
+    const rawFactor = zoomMatch[2] ? Number(zoomMatch[2]) : undefined;
+    // The unit is read from its own capture group. Re-scanning the whole line for "%"
+    // would misread a percent sign inside the target (e.g. "zoom in 2x on the 50% off
+    // banner") as the factor's unit and turn a 2x zoom into scale 0.02.
+    const isPercent = zoomMatch[3] === '%';
+    const target = zoomMatch[4] ? trimPunct(zoomMatch[4].trim()) : undefined;
+    // Determine scale: zoom in > 1, zoom out < 1
+    // NOTE(review): an explicit factor of 0 produces scale 0 (in) or Infinity (out);
+    // no positivity check is applied on this path.
+    let scale: number;
+    if (rawFactor === undefined) {
+      // No factor given: default to 2x in / 0.5x out
+      scale = direction === 'out' ? 0.5 : 2.0;
+    } else if (isPercent) {
+      // Percentages are absolute scales regardless of direction:
+      // "zoom out 50%" → scale = 0.5, "zoom in 200%" → scale = 2.0
+      scale = rawFactor / 100;
+    } else {
+      // "zoom in 2x" → scale = 2.0, "zoom out 2x" → scale = 0.5
+      scale = direction === 'out' ? 1 / rawFactor : rawFactor;
+    }
+    return { kind: 'zoom', scale, ...(target ? { target } : {}), verbatim };
+  }
// ── waitUntil: "wait until screen is loaded", "wait until is visible/gone" ──
// Also: "wait 5s until ..." / "wait 10 seconds until ..."
diff --git a/src/flow/parse-yaml-flow.ts b/src/flow/parse-yaml-flow.ts
index 9b202af..0f4dfac 100644
--- a/src/flow/parse-yaml-flow.ts
+++ b/src/flow/parse-yaml-flow.ts
@@ -106,6 +106,18 @@ export function normalizeStructured(raw: unknown, index: number): FlowStep | nul
};
}
+ // ── Multi-key: zoom { scale, target? } ──
+ if (keys.includes('zoom') || (keys.includes('scale') && !keys.includes('from'))) {
+ const scaleVal = o.zoom !== undefined ? Number(o.zoom) : Number(o.scale);
+ if (!Number.isFinite(scaleVal) || scaleVal <= 0) {
+ throw new Error(
+ `Step ${index + 1}: zoom scale must be a positive number (e.g. 2.0 = zoom in 2x, 0.5 = zoom out)`
+ );
+ }
+ const target = o.target != null ? String(o.target).trim() : undefined;
+ return { kind: 'zoom', scale: scaleVal, ...(target ? { target } : {}) };
+ }
+
// ── Multi-key: drag { from, to } ──
if (keys.includes('from') && keys.includes('to')) {
const duration = o.duration != null ? Number(o.duration) : undefined;
@@ -194,6 +206,15 @@ export function normalizeStructured(raw: unknown, index: number): FlowStep | nul
);
}
if (k === 'tap') return { kind: 'tap', label: String(v) };
+ if (k === 'zoom') {
+ const scale = Number(v);
+ if (!Number.isFinite(scale) || scale <= 0) {
+ throw new Error(
+ `Step ${index + 1}: zoom scale must be a positive number (e.g. 2.0 = zoom in 2x, 0.5 = zoom out)`
+ );
+ }
+ return { kind: 'zoom', scale };
+ }
if (k === 'type') return { kind: 'type', text: String(v) };
if (k === 'assert' || k === 'verify' || k === 'check') {
return { kind: 'assert', text: String(v) };
diff --git a/src/flow/run-yaml-flow.ts b/src/flow/run-yaml-flow.ts
index 9b93fac..571b0cd 100644
--- a/src/flow/run-yaml-flow.ts
+++ b/src/flow/run-yaml-flow.ts
@@ -148,6 +148,8 @@ function stepLabel(step: FlowStep): string {
return 'goHome';
case 'swipe':
return `swipe ${step.direction}`;
+ case 'zoom':
+ return `zoom ${step.scale >= 1 ? 'in' : 'out'} (${step.scale}x)${step.target ? ` on "${step.target}"` : ''}`;
case 'drag':
return `drag "${step.from}" to "${step.to}"`;
case 'assert':
@@ -1014,6 +1016,68 @@ export async function executeStep(
message: count > 1 ? `Swiped ${dir} ${count} times` : `Swiped ${dir}`,
};
}
+    case 'zoom': {
+      // Performs a pinch-zoom gesture via the `appium_gesture` tool. When the step names a
+      // target, we first try to resolve it to an element UUID; any failure in that lookup
+      // is non-fatal and the gesture is sent without an element.
+      // Vision mode: use df-vision to locate the element by description.
+      // DOM mode: parse page source and find element UUID.
+      let elementUUID: string | undefined;
+      const targetLabel = step.target;
+      if (targetLabel) {
+        try {
+          if (isVisionMode() || isVisionLocateEnabled()) {
+            // Vision path: locate element → get synthetic ai-element UUID with coordinates.
+            // NOTE(review): ai-element UUIDs are filtered out below, so if this helper only
+            // ever returns synthetic UUIDs the lookup has no effect on the gesture — confirm.
+            const visionUuid = await findElementByVision(mcp, targetLabel);
+            if (visionUuid) elementUUID = visionUuid;
+          } else {
+            // DOM path: parse page source → find element UUID
+            const [
+              { findByIdStrategies: zoomFindById },
+              { detectPlatform: zoomDetectPlatform },
+              { parseAndroidPageSource: zoomParseAndroid },
+              { parseIOSPageSource: zoomParseIOS },
+            ] = await Promise.all([
+              import('../agent/element-finder.js'),
+              import('../perception/screen.js'),
+              import('../perception/android-parser.js'),
+              import('../perception/ios-parser.js'),
+            ]);
+            const pageSource = await getPageSource(mcp);
+            const platform = zoomDetectPlatform(pageSource);
+            const elements =
+              platform === 'android' ? zoomParseAndroid(pageSource) : zoomParseIOS(pageSource);
+            // NOTE(review): ascending sort picks the LOWEST non-negative score — this assumes
+            // scoreTapMatch returns lower = better; confirm against its definition.
+            const scored = elements
+              .map((el) => ({ el, s: scoreTapMatch(el, targetLabel) }))
+              .filter((x) => x.s >= 0)
+              .sort((a, b) => a.s - b.s);
+            const pick = scored[0]?.el;
+            if (pick) {
+              elementUUID =
+                (await zoomFindById(mcp, pick.accessibilityId || pick.id, pick.text)) ?? undefined;
+            }
+          }
+        } catch {
+          // Non-fatal: fall back to screen-center zoom
+        }
+      }
+
+      // Only pass real Appium element UUIDs — ai-element: synthetic UUIDs are not
+      // in Appium's element cache and will cause a 404 in the pinch handler.
+      const targetApplied = elementUUID !== undefined && !isAIElement(elementUUID);
+      const pinchArgs: Record<string, unknown> = { action: 'pinch_zoom', scale: step.scale };
+      if (targetApplied) {
+        pinchArgs.elementUUID = elementUUID;
+      }
+
+      const zoomResult = await mcp.callTool('appium_gesture', pinchArgs);
+      const zoomText =
+        zoomResult.content
+          ?.map((c: { type: string; text?: string }) => (c.type === 'text' ? c.text : ''))
+          .join('') ?? '';
+      // The tool reports failures in its text payload, so detect them by keyword.
+      const zoomTextLower = zoomText.toLowerCase();
+      if (zoomTextLower.includes('failed') || zoomTextLower.includes('error')) {
+        return { success: false, message: zoomText.slice(0, 200) };
+      }
+      const direction = step.scale >= 1 ? 'in' : 'out';
+      // Report the target only when it was actually forwarded to the gesture; otherwise
+      // say that the zoom fell back to the screen center instead of claiming it hit the target.
+      let targetDesc = '';
+      if (targetLabel) {
+        targetDesc = targetApplied
+          ? ` on "${targetLabel}"`
+          : ` (target "${targetLabel}" not resolved; zoomed at screen center)`;
+      }
+      return {
+        success: true,
+        message: `Zoomed ${direction} (scale=${step.scale})${targetDesc}`,
+      };
+    }
case 'drag': {
const dragApiKey = getStarkVisionApiKey();
const dragBaseUrl = getStarkVisionBaseUrl();
diff --git a/src/flow/types.ts b/src/flow/types.ts
index efeceae..c776233 100644
--- a/src/flow/types.ts
+++ b/src/flow/types.ts
@@ -71,6 +71,13 @@ export type FlowStep =
| ({ kind: 'back' } & Verbatim)
| ({ kind: 'home' } & Verbatim)
| ({ kind: 'swipe'; direction: 'up' | 'down' | 'left' | 'right'; repeat?: number } & Verbatim)
+ | ({
+ kind: 'zoom';
+ /** > 1 = zoom in (pinch open), < 1 = zoom out (pinch close). e.g. 2.0 = 2x zoom in, 0.5 = zoom out */
+ scale: number;
+ /** Optional label of the element to zoom on. If omitted, zooms on the center of the screen. */
+ target?: string;
+ } & Verbatim)
| ({
kind: 'drag';
from: string;
diff --git a/src/flow/vision-execute.ts b/src/flow/vision-execute.ts
index 3d75288..258e423 100644
--- a/src/flow/vision-execute.ts
+++ b/src/flow/vision-execute.ts
@@ -403,6 +403,25 @@ function preCheck(instruction: string): PreCheckResult | null {
};
}
+  // 5c. zoom / pinch — "zoom in [Nx] [on/into] [the] X", "zoom out [on X]", "pinch in/out [on X]"
+  // "on/into" is optional so "zoom in the map" works as well as "zoom in on the map"
+  // Capture groups: 1 = direction, 2 = numeric factor, 3 = unit (x | time(s) | %), 4 = target.
+  const zoomMatch = t.match(
+    /^(?:zoom|pinch)\s+(in|out)(?:\s+(\d+(?:\.\d+)?)\s*(x|times?|%)?)?(?:\s+(?:(?:on|into)\s+)?(?:the\s+)?(.+))?$/i
+  );
+  if (zoomMatch) {
+    const direction = zoomMatch[1].toLowerCase();
+    const rawFactor = zoomMatch[2] ? Number(zoomMatch[2]) : undefined;
+    // Take the unit from its own capture group: scanning the whole instruction for "%"
+    // would treat a percent sign inside the target ("zoom in 2x on the 50% off banner")
+    // as the factor's unit and yield scale 0.02 instead of 2.
+    const isPercent = zoomMatch[3] === '%';
+    const target = zoomMatch[4] ? zoomMatch[4].replace(/[.!?]+$/g, '').trim() : undefined;
+    let scale: number;
+    if (rawFactor === undefined) {
+      // No explicit factor: default to 2x in / 0.5x out
+      scale = direction === 'out' ? 0.5 : 2.0;
+    } else if (isPercent) {
+      // Percentages are absolute scales: "50%" → 0.5, "200%" → 2.0
+      scale = rawFactor / 100;
+    } else {
+      // Multipliers are relative to direction: "out 2x" → 0.5, "in 2x" → 2.0
+      scale = direction === 'out' ? 1 / rawFactor : rawFactor;
+    }
+    return { step: { kind: 'zoom', scale, ...(target ? { target } : {}), verbatim: t } };
+  }
+
// 6. Visibility assert — any instruction starting with an assert/verify verb,
// or "is X visible?" pattern. Pass the full instruction to the vision model
// as-is — let the LLM interpret what to check instead of brittle regex parsing.
@@ -472,8 +491,12 @@ export async function visionExecute(
// ── Pre-check: non-visual instructions ──
const pre = preCheck(instruction);
if (pre?.step) {
- // scrollAssert and waitUntil need executeStep for their polling/scroll logic
- if (pre.step.kind === 'scrollAssert' || pre.step.kind === 'waitUntil') {
+ // These step kinds need executeStep for their device gesture / polling logic
+ if (
+ pre.step.kind === 'scrollAssert' ||
+ pre.step.kind === 'waitUntil' ||
+ pre.step.kind === 'zoom'
+ ) {
return { step: pre.step, result: { success: false, message: '__needs_executeStep__' } };
}
// Other pre-check steps — let caller fall through to classifyInstruction → executeStep
diff --git a/src/index.ts b/src/index.ts
index dd62e17..f6326d6 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -958,13 +958,13 @@ async function main() {
const prevGoal = executor.all[subGoalIdx - 1];
const completedGoalsList = executor.all
.filter((sg) => sg.status === 'completed')
- .map((sg) => `${sg.goal} → ${sg.result}`);
+ .map((sg) => `${sg.executedAs ?? sg.goal} → ${sg.result}`);
const [readiness, decision] = await Promise.all([
prevGoal
? assessScreenReadiness(
plannerModel,
- prevGoal.goal,
+ prevGoal.executedAs ?? prevGoal.goal,
subGoal.goal,
orchestratorDom,
thinkingOptions,
@@ -1052,6 +1052,9 @@ async function main() {
}
}
+ // Track the actual goal being executed so reconciliation uses the rewritten goal, not the original
+ subGoal.executedAs = effectiveGoal;
+
emitJson({
event: 'goal_start',
data: { goal: effectiveGoal, subGoalIndex: subGoalIdx, totalSubGoals: executor.all.length },
diff --git a/src/playground/index.ts b/src/playground/index.ts
index 8721496..fe6c348 100644
--- a/src/playground/index.ts
+++ b/src/playground/index.ts
@@ -90,6 +90,8 @@ function stepAction(step: FlowStep): string {
return 'type';
case 'swipe':
return 'swipe';
+ case 'zoom':
+ return 'zoom';
case 'wait':
return 'wait';
case 'waitUntil':
@@ -128,6 +130,8 @@ function stepTarget(step: FlowStep): string {
return `"${step.text}"${step.target ? ` → ${step.target}` : ''}`;
case 'swipe':
return step.direction;
+ case 'zoom':
+ return `${step.scale >= 1 ? 'in' : 'out'} (${step.scale}x)${step.target ? ` on "${step.target}"` : ''}`;
case 'wait':
return `${step.seconds}s`;
case 'waitUntil':
@@ -170,6 +174,8 @@ function spinnerDetail(step: FlowStep): string {
return 'typing into the field…';
case 'swipe':
return 'swiping the screen…';
+ case 'zoom':
+ return `zooming ${step.scale >= 1 ? 'in' : 'out'}…`;
case 'scrollAssert':
return 'scanning the screen…';
case 'assert':
@@ -260,6 +266,10 @@ function stepToYaml(step: FlowStep): unknown {
return `type "${step.text}"`;
case 'swipe':
return `swipe ${step.direction}`;
+    case 'zoom': {
+      // The natural-language flow parser reads "zoom out Nx" as scale = 1 / N, so a
+      // zoom-out step must be written with the reciprocal factor ("zoom out 2x" for
+      // scale 0.5). Writing the raw scale ("zoom out 0.5x") would be parsed back as a
+      // 2x zoom IN, so the exported YAML would not reproduce the recorded step.
+      const zoomingIn = step.scale >= 1;
+      const factor = zoomingIn ? step.scale : 1 / step.scale;
+      const line = `zoom ${zoomingIn ? 'in' : 'out'} ${factor}x`;
+      return step.target ? `${line} on ${step.target}` : line;
+    }
case 'wait':
return `wait ${step.seconds} s`;
case 'waitUntil':