diff --git a/.github/workflows/action-test.yml b/.github/workflows/action-test.yml index 36585f1..856887a 100644 --- a/.github/workflows/action-test.yml +++ b/.github/workflows/action-test.yml @@ -8,7 +8,7 @@ on: push: branches: [main] paths: - - 'github-action/**' + - 'action.yml' - '.github/workflows/action-test.yml' - 'flows/**' workflow_dispatch: @@ -39,7 +39,7 @@ jobs: - uses: actions/checkout@v4 # Use the local action definition (same repo, same commit) - - uses: ./github-action + - uses: ./ id: run with: flow: ${{ github.event.inputs.flow || 'flows/youtube.yaml' }} @@ -61,7 +61,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: ./github-action + - uses: ./ id: run with: goal: 'Open YouTube app and verify the home feed is visible' @@ -84,7 +84,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: ./github-action + - uses: ./ id: run with: flow: ${{ github.event.inputs.flow || 'flows/youtube.yaml' }} diff --git a/.github/workflows/layer3-branch-test.yml b/.github/workflows/layer3-branch-test.yml index 14b6062..7afc711 100644 --- a/.github/workflows/layer3-branch-test.yml +++ b/.github/workflows/layer3-branch-test.yml @@ -30,7 +30,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: ./github-action + - uses: ./ id: run with: use-local-build: 'true' @@ -55,7 +55,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: ./github-action + - uses: ./ id: run with: use-local-build: 'true' @@ -75,17 +75,19 @@ jobs: ios-flow: name: iOS — YAML flow runs-on: macos-14 - if: github.event_name == 'pull_request' || (github.event_name == 'workflow_dispatch' && inputs.platform == 'ios') + if: false steps: - uses: actions/checkout@v4 - - uses: ./github-action + - uses: ./ id: run with: use-local-build: 'true' flow: ${{ inputs.flow || 'flows/wdio.yaml' }} platform: ios + ios-device-type: simulator + mcp-debug: 'true' provider: gemini agent-mode: vision api-key: ${{ secrets.LLM_API_KEY }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b59610..f785f76 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,24 +2,24 @@ ### Features -* add action.yml at repo root for GitHub Marketplace publishing ([#20](https://github.com/AppiumTestDistribution/AppClaw/issues/20)) ([c007399](https://github.com/AppiumTestDistribution/AppClaw/commit/c007399fa670273058cd51e65f0fd68323ccb3be)) +- add action.yml at repo root for GitHub Marketplace publishing ([#20](https://github.com/AppiumTestDistribution/AppClaw/issues/20)) ([c007399](https://github.com/AppiumTestDistribution/AppClaw/commit/c007399fa670273058cd51e65f0fd68323ccb3be)) ## 1.0.0 (2026-04-16) ### Features -* integrate ai-sdk-ollama for LLM support and update configuration ([#9](https://github.com/AppiumTestDistribution/AppClaw/issues/9)) ([c6794d7](https://github.com/AppiumTestDistribution/AppClaw/commit/c6794d718a37ef690c09f5fb006c8994c78e361b)) -* parallel testing support and screen recording for SDK ([#16](https://github.com/AppiumTestDistribution/AppClaw/issues/16)) ([7d14e7b](https://github.com/AppiumTestDistribution/AppClaw/commit/7d14e7b760c41783c61f1227c037e1b28d184a5c)) -* strict playground tap matching, waitUntil pre-check, faster vision assert ([59b8c29](https://github.com/AppiumTestDistribution/AppClaw/commit/59b8c299bf20c9232d89bbbb4d93a9ef600cca2b)) -* vision improvements — drag support, screenshot optimization, an… ([#7](https://github.com/AppiumTestDistribution/AppClaw/issues/7)) ([8cfbcb4](https://github.com/AppiumTestDistribution/AppClaw/commit/8cfbcb483fce0dec531ad8c21c8cd93d5743d62f)) +- integrate ai-sdk-ollama for LLM support and update configuration ([#9](https://github.com/AppiumTestDistribution/AppClaw/issues/9)) ([c6794d7](https://github.com/AppiumTestDistribution/AppClaw/commit/c6794d718a37ef690c09f5fb006c8994c78e361b)) +- parallel testing support and screen recording for SDK ([#16](https://github.com/AppiumTestDistribution/AppClaw/issues/16)) ([7d14e7b](https://github.com/AppiumTestDistribution/AppClaw/commit/7d14e7b760c41783c61f1227c037e1b28d184a5c)) +- strict 
playground tap matching, waitUntil pre-check, faster vision assert ([59b8c29](https://github.com/AppiumTestDistribution/AppClaw/commit/59b8c299bf20c9232d89bbbb4d93a9ef600cca2b)) +- vision improvements — drag support, screenshot optimization, an… ([#7](https://github.com/AppiumTestDistribution/AppClaw/issues/7)) ([8cfbcb4](https://github.com/AppiumTestDistribution/AppClaw/commit/8cfbcb483fce0dec531ad8c21c8cd93d5743d62f)) ### Bug Fixes -* add semantic-release for automated versioning and npm publishing ([#19](https://github.com/AppiumTestDistribution/AppClaw/issues/19)) ([66c73a6](https://github.com/AppiumTestDistribution/AppClaw/commit/66c73a677e763112c4fab80dd29301f3d2071532)) -* ci ([#10](https://github.com/AppiumTestDistribution/AppClaw/issues/10)) ([dfcd62f](https://github.com/AppiumTestDistribution/AppClaw/commit/dfcd62fa083d673c98fc0c381820c7dd58d36818)) -* DOM locator resolution, vision assert parsing, and appium-mcp coordinate scaling ([9272c36](https://github.com/AppiumTestDistribution/AppClaw/commit/9272c36b65e7bd996b730bb6d67d0fa6fee9518a)) -* read CLI version from package.json instead of hardcoded string ([#14](https://github.com/AppiumTestDistribution/AppClaw/issues/14)) ([fcb3a64](https://github.com/AppiumTestDistribution/AppClaw/commit/fcb3a6417ddc48d72d246bc9fd5dd1438020635d)) -* screenshot parsing ([e449a23](https://github.com/AppiumTestDistribution/AppClaw/commit/e449a2341fc67e193f1519bae16d4cace878bcfc)) -* scroll-aware stuck detection, press_enter tool, and post-done verification ([c03bbe4](https://github.com/AppiumTestDistribution/AppClaw/commit/c03bbe4222ce7fd7bba6867f7d1e59ac5ef3c8ee)) -* terminal UI ([294a780](https://github.com/AppiumTestDistribution/AppClaw/commit/294a780113d8afdb99b80cf57b47db5b3fe12dc2)) -* terminal view ([42c0e75](https://github.com/AppiumTestDistribution/AppClaw/commit/42c0e75e2d8a28c569b6511891628c1b98380cc3)) +- add semantic-release for automated versioning and npm publishing 
([#19](https://github.com/AppiumTestDistribution/AppClaw/issues/19)) ([66c73a6](https://github.com/AppiumTestDistribution/AppClaw/commit/66c73a677e763112c4fab80dd29301f3d2071532)) +- ci ([#10](https://github.com/AppiumTestDistribution/AppClaw/issues/10)) ([dfcd62f](https://github.com/AppiumTestDistribution/AppClaw/commit/dfcd62fa083d673c98fc0c381820c7dd58d36818)) +- DOM locator resolution, vision assert parsing, and appium-mcp coordinate scaling ([9272c36](https://github.com/AppiumTestDistribution/AppClaw/commit/9272c36b65e7bd996b730bb6d67d0fa6fee9518a)) +- read CLI version from package.json instead of hardcoded string ([#14](https://github.com/AppiumTestDistribution/AppClaw/issues/14)) ([fcb3a64](https://github.com/AppiumTestDistribution/AppClaw/commit/fcb3a6417ddc48d72d246bc9fd5dd1438020635d)) +- screenshot parsing ([e449a23](https://github.com/AppiumTestDistribution/AppClaw/commit/e449a2341fc67e193f1519bae16d4cace878bcfc)) +- scroll-aware stuck detection, press_enter tool, and post-done verification ([c03bbe4](https://github.com/AppiumTestDistribution/AppClaw/commit/c03bbe4222ce7fd7bba6867f7d1e59ac5ef3c8ee)) +- terminal UI ([294a780](https://github.com/AppiumTestDistribution/AppClaw/commit/294a780113d8afdb99b80cf57b47db5b3fe12dc2)) +- terminal view ([42c0e75](https://github.com/AppiumTestDistribution/AppClaw/commit/42c0e75e2d8a28c569b6511891628c1b98380cc3)) diff --git a/action.yml b/action.yml index cd9716f..4ef1ced 100644 --- a/action.yml +++ b/action.yml @@ -52,6 +52,40 @@ inputs: required: false default: '500' + # ── Debug ──────────────────────────────────────────────────────────────────── + mcp-debug: + description: 'Enable MCP debug logging (MCP_DEBUG=1). Default: false' + required: false + default: 'false' + mcp-timeout-ms: + description: 'MCP request timeout in milliseconds. Default: 300000' + required: false + default: '300000' + llm-thinking: + description: 'Enable LLM extended thinking: on or off. 
Default: off' + required: false + default: 'off' + + # ── iOS device ─────────────────────────────────────────────────────────────── + ios-device-type: + description: 'iOS device type: simulator or real. Default: simulator' + required: false + default: 'simulator' + + # ── iOS simulator ──────────────────────────────────────────────────────────── + device-udid: + description: 'Explicit device/simulator UDID to target. Leave empty to let AppClaw auto-detect.' + required: false + default: '' + ios-simulator-name: + description: 'iOS simulator device model to boot (e.g. "iPhone 16", "iPhone 15 Pro"). Default: iPhone 16' + required: false + default: 'iPhone 16' + ios-simulator-os: + description: 'iOS version to use when multiple runtimes are available (e.g. "18.4", "17.5"). Default: latest available' + required: false + default: '' + # ── Android emulator ───────────────────────────────────────────────────────── android-api-level: description: 'Android emulator API level. Default: 33 (Android 13)' @@ -191,7 +225,7 @@ runs: LLM_PROVIDER: ${{ inputs.provider }} LLM_API_KEY: ${{ inputs.api-key }} LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' + LLM_THINKING: ${{ inputs.llm-thinking }} AGENT_MODE: ${{ inputs.agent-mode }} MAX_STEPS: ${{ inputs.max-steps }} STEP_DELAY: ${{ inputs.step-delay }} @@ -212,7 +246,7 @@ runs: LLM_PROVIDER: ${{ inputs.provider }} LLM_API_KEY: ${{ inputs.api-key }} LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' + LLM_THINKING: ${{ inputs.llm-thinking }} AGENT_MODE: ${{ inputs.agent-mode }} MAX_STEPS: ${{ inputs.max-steps }} STEP_DELAY: ${{ inputs.step-delay }} @@ -243,7 +277,7 @@ runs: LLM_PROVIDER: ${{ inputs.provider }} LLM_API_KEY: ${{ inputs.api-key }} LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' + LLM_THINKING: ${{ inputs.llm-thinking }} AGENT_MODE: ${{ inputs.agent-mode }} MAX_STEPS: ${{ inputs.max-steps }} STEP_DELAY: ${{ inputs.step-delay }} @@ -265,7 +299,7 @@ runs: LLM_PROVIDER: ${{ inputs.provider }} LLM_API_KEY: ${{ 
inputs.api-key }} LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' + LLM_THINKING: ${{ inputs.llm-thinking }} AGENT_MODE: ${{ inputs.agent-mode }} MAX_STEPS: ${{ inputs.max-steps }} STEP_DELAY: ${{ inputs.step-delay }} @@ -279,6 +313,88 @@ runs: disable-animations: true script: appclaw "${{ inputs.goal }}" --platform android + # ── iOS — pre-download WebDriverAgent ──────────────────────────────────── + - name: Download prebuilt WebDriverAgent for iOS simulator + if: inputs.platform == 'ios' && inputs.cloud-provider == '' + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: | + # Resolve latest WDA version via GitHub API (authenticated = 5000/hr, no rate-limit risk) + WDA_VERSION=$(curl -fsSL \ + -H "Authorization: Bearer ${GH_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/appium/WebDriverAgent/releases/latest" \ + | python3 -c "import sys,json; print(json.load(sys.stdin)['tag_name'].lstrip('v'))") + + if [ -z "$WDA_VERSION" ]; then + echo "::error::Could not resolve latest WDA version from GitHub" + exit 1 + fi + + ARCH=$(uname -m) # arm64 on macos-14 (Apple Silicon), x86_64 otherwise + URL="https://github.com/appium/WebDriverAgent/releases/download/v${WDA_VERSION}/WebDriverAgentRunner-Build-Sim-${ARCH}.zip" + + echo "Downloading prebuilt WDA v${WDA_VERSION} for ${ARCH}..." + curl -fsSL "${URL}" -o /tmp/wda.zip + unzip -q /tmp/wda.zip -d /tmp/wda + + WDA_APP="/tmp/wda/WebDriverAgentRunner-Runner.app" + if [ ! 
-d "$WDA_APP" ]; then + echo "::error::WebDriverAgentRunner-Runner.app not found after extraction" + ls -la /tmp/wda/ + exit 1 + fi + + echo "APPIUM_MCP_WDA_APP_PATH=${WDA_APP}" >> $GITHUB_ENV + echo "WDA pre-downloaded: ${WDA_APP}" + + # ── iOS — boot simulator ───────────────────────────────────────────────── + - name: Boot iOS simulator + if: inputs.platform == 'ios' && inputs.cloud-provider == '' && inputs.ios-device-type == 'simulator' + shell: bash + env: + SIM_NAME: ${{ inputs.ios-simulator-name }} + SIM_OS: ${{ inputs.ios-simulator-os }} + run: | + xcrun simctl list devices available -j > /tmp/simctl_devices.json + + UDID=$(python3 <<'EOF' + import json, os, re, sys + sim_name = os.environ.get('SIM_NAME', 'iPhone 16').lower() + sim_os = os.environ.get('SIM_OS', '').strip() + data = json.load(open('/tmp/simctl_devices.json')) + candidates = [] + for runtime, devs in data['devices'].items(): + if 'iOS' not in runtime: + continue + # Extract version from runtime key, e.g. "com.apple.CoreSimulator.SimRuntime.iOS-18-4" → "18.4" + m = re.search(r'iOS[- ]([\d][\d.-]+)', runtime, re.IGNORECASE) + ver = m.group(1).replace('-', '.') if m else '' + if sim_os and not ver.startswith(sim_os): + continue + for d in devs: + if d.get('isAvailable') and sim_name in d.get('name', '').lower(): + candidates.append((ver, d['udid'])) + if not candidates: + sys.exit(1) + # Pick highest iOS version + candidates.sort(key=lambda x: [int(p) for p in x[0].split('.') if p.isdigit()], reverse=True) + print(candidates[0][1]) + EOF + ) + + if [ -z "$UDID" ]; then + echo "::error::No available iOS simulator matching name='${SIM_NAME}' os='${SIM_OS}'" + xcrun simctl list devices available + exit 1 + fi + + echo "Booting simulator $UDID (${SIM_NAME})" + xcrun simctl boot "$UDID" 2>/dev/null || true # already Booted is OK + xcrun simctl bootstatus "$UDID" -b # block until fully booted + echo "IOS_SIMULATOR_UDID=$UDID" >> "$GITHUB_ENV" + # ── iOS — YAML flow 
─────────────────────────────────────────────────────── - name: Run YAML flow on iOS simulator if: inputs.platform == 'ios' && inputs.cloud-provider == '' && inputs.flow != '' @@ -287,12 +403,15 @@ runs: LLM_PROVIDER: ${{ inputs.provider }} LLM_API_KEY: ${{ inputs.api-key }} LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' + LLM_THINKING: ${{ inputs.llm-thinking }} AGENT_MODE: ${{ inputs.agent-mode }} MAX_STEPS: ${{ inputs.max-steps }} STEP_DELAY: ${{ inputs.step-delay }} PLATFORM: ios - DEVICE_TYPE: simulator + DEVICE_TYPE: ${{ inputs.ios-device-type }} + DEVICE_UDID: ${{ inputs.device-udid || env.IOS_SIMULATOR_UDID }} + MCP_DEBUG: ${{ inputs.mcp-debug == 'true' && '1' || '0' }} + MCP_TIMEOUT_MS: ${{ inputs.mcp-timeout-ms }} run: appclaw --flow "${{ inputs.flow }}" --platform ios # ── iOS — natural language goal ─────────────────────────────────────────── @@ -303,12 +422,15 @@ runs: LLM_PROVIDER: ${{ inputs.provider }} LLM_API_KEY: ${{ inputs.api-key }} LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' + LLM_THINKING: ${{ inputs.llm-thinking }} AGENT_MODE: ${{ inputs.agent-mode }} MAX_STEPS: ${{ inputs.max-steps }} STEP_DELAY: ${{ inputs.step-delay }} PLATFORM: ios - DEVICE_TYPE: simulator + DEVICE_TYPE: ${{ inputs.ios-device-type }} + DEVICE_UDID: ${{ inputs.device-udid || env.IOS_SIMULATOR_UDID }} + MCP_DEBUG: ${{ inputs.mcp-debug == 'true' && '1' || '0' }} + MCP_TIMEOUT_MS: ${{ inputs.mcp-timeout-ms }} run: appclaw "${{ inputs.goal }}" --platform ios # ── Report ──────────────────────────────────────────────────────────────── diff --git a/github-action/README.md b/github-action/README.md index a87a74c..a49a32a 100644 --- a/github-action/README.md +++ b/github-action/README.md @@ -21,7 +21,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/login.yaml platform: android @@ -31,7 +31,7 @@ jobs: ### Android — run a natural language goal ```yaml 
-- uses: AppiumTestDistribution/AppClaw/github-action@v1 +- uses: AppiumTestDistribution/AppClaw@v1 with: goal: 'Open YouTube, search for Appium 3.0, verify the first result is visible' platform: android @@ -47,10 +47,12 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/ios-login.yaml platform: ios + ios-simulator-name: 'iPhone 16' # optional: defaults to iPhone 16 + ios-simulator-os: '18.4' # optional: defaults to latest api-key: ${{ secrets.LLM_API_KEY }} ``` @@ -72,6 +74,10 @@ jobs: | `android-api-level` | no | `33` | Android emulator API level (33 = Android 13) | | `android-profile` | no | `pixel_6` | Android AVD hardware profile | | `android-target` | no | `default` | Emulator target: `default` or `google_apis` | +| `ios-device-type` | no | `simulator` | iOS device type: `simulator` or `real` | +| `ios-simulator-name` | no | `iPhone 16` | iOS simulator model to boot (e.g. `iPhone 15`, `iPad Air`) | +| `ios-simulator-os` | no | _(latest)_ | iOS version filter for simulator selection (e.g. `18.4`) | +| `mcp-debug` | no | `false` | Enable MCP debug logging (`MCP_DEBUG=1`). Useful for diagnosing CI timeouts. | | `cloud-provider` | no | _(local)_ | Cloud device provider: `lambdatest`. Leave empty for local emulator/simulator. 
| | `lambdatest-username` | no² | — | LambdaTest account username | | `lambdatest-access-key` | no² | — | LambdaTest access key | @@ -143,7 +149,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: ${{ matrix.flow }} platform: android @@ -160,7 +166,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/ios-login.yaml platform: ios @@ -176,7 +182,7 @@ jobs: ### Pin model for cost control ```yaml -- uses: AppiumTestDistribution/AppClaw/github-action@v1 +- uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/smoke.yaml platform: android @@ -187,7 +193,7 @@ jobs: ### Pin AppClaw version ```yaml -- uses: AppiumTestDistribution/AppClaw/github-action@v1 +- uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/smoke.yaml platform: android @@ -198,7 +204,7 @@ jobs: ### Use report path in a downstream step ```yaml -- uses: AppiumTestDistribution/AppClaw/github-action@v1 +- uses: AppiumTestDistribution/AppClaw@v1 id: appclaw with: flow: flows/login.yaml @@ -212,7 +218,7 @@ jobs: ### Vision mode (screenshot-based AI) ```yaml -- uses: AppiumTestDistribution/AppClaw/github-action@v1 +- uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/onboarding.yaml platform: android @@ -232,7 +238,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/full-regression.yaml platform: android diff --git a/github-action/action.yml b/github-action/action.yml deleted file mode 100644 index cd9716f..0000000 --- a/github-action/action.yml +++ /dev/null @@ -1,332 +0,0 @@ -name: 'AppClaw Mobile Tests' -description: 'Run mobile UI automation flows and AI-driven goals in CI — Android emulator, iOS simulator, or LambdaTest cloud devices.' 
-author: 'AppiumTestDistribution' - -branding: - icon: 'smartphone' - color: 'purple' - -# ── Inputs ──────────────────────────────────────────────────────────────────── - -inputs: - # ── What to run ───────────────────────────────────────────────────────────── - flow: - description: 'Path to a YAML flow file (mutually exclusive with goal)' - required: false - default: '' - goal: - description: 'Natural language goal for the LLM agent (mutually exclusive with flow)' - required: false - default: '' - - # ── Platform ───────────────────────────────────────────────────────────────── - platform: - description: 'Target platform: android or ios' - required: false - default: 'android' - - # ── LLM ────────────────────────────────────────────────────────────────────── - provider: - description: 'LLM provider: gemini, anthropic, openai, groq' - required: false - default: 'gemini' - api-key: - description: 'LLM API key — passed to AppClaw as LLM_API_KEY' - required: true - model: - description: 'LLM model ID to use (e.g. gemini-2.0-flash, claude-3-5-haiku-20241022). Defaults to the provider built-in.' - required: false - default: '' - - # ── Agent ──────────────────────────────────────────────────────────────────── - agent-mode: - description: 'Interaction strategy: dom (element locators) or vision (screenshot AI)' - required: false - default: 'dom' - max-steps: - description: 'Maximum agent steps before the run is marked failed. Default: 30' - required: false - default: '30' - step-delay: - description: 'Delay in milliseconds between steps. Default: 500' - required: false - default: '500' - - # ── Android emulator ───────────────────────────────────────────────────────── - android-api-level: - description: 'Android emulator API level. Default: 33 (Android 13)' - required: false - default: '33' - android-profile: - description: 'Android AVD hardware profile. 
Default: pixel_6' - required: false - default: 'pixel_6' - android-target: - description: 'Emulator system image target: default or google_apis' - required: false - default: 'default' - android-arch: - description: 'Emulator CPU architecture: x86_64 or x86. Default: x86_64 (required for API 31+)' - required: false - default: 'x86_64' - - # ── LambdaTest cloud ───────────────────────────────────────────────────────── - cloud-provider: - description: 'Cloud device provider: lambdatest. Leave empty for local emulator/simulator (default).' - required: false - default: '' - lambdatest-username: - description: 'LambdaTest account username (required when cloud-provider=lambdatest)' - required: false - default: '' - lambdatest-access-key: - description: 'LambdaTest access key (required when cloud-provider=lambdatest)' - required: false - default: '' - lambdatest-device-name: - description: 'Cloud device name, e.g. "Pixel 7" or "iPhone 14" (required when cloud-provider=lambdatest)' - required: false - default: '' - lambdatest-os-version: - description: 'Cloud OS version, e.g. "13" for Android or "16" for iOS (required when cloud-provider=lambdatest)' - required: false - default: '' - lambdatest-app: - description: 'LambdaTest app ID (lt://APP...) — the app to test on the cloud device' - required: false - default: '' - - # ── Report ─────────────────────────────────────────────────────────────────── - report: - description: 'Upload HTML report as a workflow artifact after the run. Default: true' - required: false - default: 'true' - report-name: - description: 'Name of the uploaded artifact. Default: appclaw-report' - required: false - default: 'appclaw-report' - - # ── AppClaw version ─────────────────────────────────────────────────────────── - appclaw-version: - description: 'AppClaw npm package version to install. Default: latest' - required: false - default: 'latest' - use-local-build: - description: 'Build and install AppClaw from the checked-out source instead of npm. 
Use in PRs to test local changes.' - required: false - default: 'false' - -# ── Outputs ─────────────────────────────────────────────────────────────────── - -outputs: - report-path: - description: 'Path to the generated .appclaw/runs// report directory' - value: ${{ steps.report-path.outputs.path }} - -# ── Steps ───────────────────────────────────────────────────────────────────── - -runs: - using: composite - steps: - # ── Validate ────────────────────────────────────────────────────────────── - - name: Validate inputs - shell: bash - run: | - if [ -z "${{ inputs.flow }}" ] && [ -z "${{ inputs.goal }}" ]; then - echo "::error title=Missing input::Provide either 'flow' (path to YAML) or 'goal' (natural language string)" - exit 1 - fi - if [ -n "${{ inputs.flow }}" ] && [ -n "${{ inputs.goal }}" ]; then - echo "::error title=Conflicting inputs::Provide either 'flow' or 'goal', not both" - exit 1 - fi - if [ "${{ inputs.platform }}" != "android" ] && [ "${{ inputs.platform }}" != "ios" ]; then - echo "::error title=Invalid platform::platform must be 'android' or 'ios', got '${{ inputs.platform }}'" - exit 1 - fi - if [ -n "${{ inputs.cloud-provider }}" ] && [ "${{ inputs.cloud-provider }}" != "lambdatest" ]; then - echo "::error title=Invalid cloud-provider::cloud-provider must be 'lambdatest' or empty, got '${{ inputs.cloud-provider }}'" - exit 1 - fi - if [ "${{ inputs.cloud-provider }}" = "lambdatest" ]; then - if [ -z "${{ inputs.lambdatest-username }}" ] || [ -z "${{ inputs.lambdatest-access-key }}" ]; then - echo "::error title=Missing LambdaTest credentials::lambdatest-username and lambdatest-access-key are required when cloud-provider=lambdatest" - exit 1 - fi - if [ -z "${{ inputs.lambdatest-device-name }}" ] || [ -z "${{ inputs.lambdatest-os-version }}" ]; then - echo "::error title=Missing device info::lambdatest-device-name and lambdatest-os-version are required when cloud-provider=lambdatest" - exit 1 - fi - fi - - # ── Node + AppClaw 
──────────────────────────────────────────────────────── - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '22' - - - name: Install AppClaw (from npm) - if: inputs.use-local-build == 'false' - shell: bash - run: | - echo "::group::Installing appclaw@${{ inputs.appclaw-version }}" - npm install -g appclaw@${{ inputs.appclaw-version }} mjpeg-consumer - echo "::endgroup::" - - - name: Install AppClaw (from local source) - if: inputs.use-local-build == 'true' - shell: bash - run: | - echo "::group::Building and installing AppClaw from local source" - npm install --no-package-lock - npm run build - npm install -g . mjpeg-consumer - echo "::endgroup::" - - # ── LambdaTest — YAML flow ──────────────────────────────────────────────── - - name: Run YAML flow on LambdaTest - if: inputs.cloud-provider == 'lambdatest' && inputs.flow != '' - shell: bash - env: - LLM_PROVIDER: ${{ inputs.provider }} - LLM_API_KEY: ${{ inputs.api-key }} - LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' - AGENT_MODE: ${{ inputs.agent-mode }} - MAX_STEPS: ${{ inputs.max-steps }} - STEP_DELAY: ${{ inputs.step-delay }} - PLATFORM: ${{ inputs.platform }} - CLOUD_PROVIDER: lambdatest - LAMBDATEST_USERNAME: ${{ inputs.lambdatest-username }} - LAMBDATEST_ACCESS_KEY: ${{ inputs.lambdatest-access-key }} - LAMBDATEST_DEVICE_NAME: ${{ inputs.lambdatest-device-name }} - LAMBDATEST_OS_VERSION: ${{ inputs.lambdatest-os-version }} - LAMBDATEST_APP: ${{ inputs.lambdatest-app }} - run: appclaw --flow "${{ inputs.flow }}" --platform ${{ inputs.platform }} - - # ── LambdaTest — natural language goal ──────────────────────────────────── - - name: Run goal on LambdaTest - if: inputs.cloud-provider == 'lambdatest' && inputs.goal != '' - shell: bash - env: - LLM_PROVIDER: ${{ inputs.provider }} - LLM_API_KEY: ${{ inputs.api-key }} - LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' - AGENT_MODE: ${{ inputs.agent-mode }} - MAX_STEPS: ${{ inputs.max-steps }} - STEP_DELAY: ${{ 
inputs.step-delay }} - PLATFORM: ${{ inputs.platform }} - CLOUD_PROVIDER: lambdatest - LAMBDATEST_USERNAME: ${{ inputs.lambdatest-username }} - LAMBDATEST_ACCESS_KEY: ${{ inputs.lambdatest-access-key }} - LAMBDATEST_DEVICE_NAME: ${{ inputs.lambdatest-device-name }} - LAMBDATEST_OS_VERSION: ${{ inputs.lambdatest-os-version }} - LAMBDATEST_APP: ${{ inputs.lambdatest-app }} - run: appclaw "${{ inputs.goal }}" --platform ${{ inputs.platform }} - - # ── Android — enable KVM ────────────────────────────────────────────────── - - name: Enable KVM - if: inputs.platform == 'android' && inputs.cloud-provider == '' - shell: bash - run: | - echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \ - | sudo tee /etc/udev/rules.d/99-kvm4all.rules - sudo udevadm control --reload-rules - sudo udevadm trigger --name-match=kvm - - # ── Android — YAML flow ─────────────────────────────────────────────────── - - name: Run YAML flow on Android emulator - if: inputs.platform == 'android' && inputs.cloud-provider == '' && inputs.flow != '' - uses: reactivecircus/android-emulator-runner@v2 - env: - LLM_PROVIDER: ${{ inputs.provider }} - LLM_API_KEY: ${{ inputs.api-key }} - LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' - AGENT_MODE: ${{ inputs.agent-mode }} - MAX_STEPS: ${{ inputs.max-steps }} - STEP_DELAY: ${{ inputs.step-delay }} - PLATFORM: android - with: - api-level: ${{ inputs.android-api-level }} - arch: ${{ inputs.android-arch }} - profile: ${{ inputs.android-profile }} - target: ${{ inputs.android-target }} - emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim - disable-animations: true - script: appclaw --flow "${{ inputs.flow }}" --platform android - - # ── Android — natural language goal ─────────────────────────────────────── - - name: Run goal on Android emulator - if: inputs.platform == 'android' && inputs.cloud-provider == '' && inputs.goal != '' - uses: reactivecircus/android-emulator-runner@v2 - env: - 
LLM_PROVIDER: ${{ inputs.provider }} - LLM_API_KEY: ${{ inputs.api-key }} - LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' - AGENT_MODE: ${{ inputs.agent-mode }} - MAX_STEPS: ${{ inputs.max-steps }} - STEP_DELAY: ${{ inputs.step-delay }} - PLATFORM: android - with: - api-level: ${{ inputs.android-api-level }} - arch: ${{ inputs.android-arch }} - profile: ${{ inputs.android-profile }} - target: ${{ inputs.android-target }} - emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim - disable-animations: true - script: appclaw "${{ inputs.goal }}" --platform android - - # ── iOS — YAML flow ─────────────────────────────────────────────────────── - - name: Run YAML flow on iOS simulator - if: inputs.platform == 'ios' && inputs.cloud-provider == '' && inputs.flow != '' - shell: bash - env: - LLM_PROVIDER: ${{ inputs.provider }} - LLM_API_KEY: ${{ inputs.api-key }} - LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' - AGENT_MODE: ${{ inputs.agent-mode }} - MAX_STEPS: ${{ inputs.max-steps }} - STEP_DELAY: ${{ inputs.step-delay }} - PLATFORM: ios - DEVICE_TYPE: simulator - run: appclaw --flow "${{ inputs.flow }}" --platform ios - - # ── iOS — natural language goal ─────────────────────────────────────────── - - name: Run goal on iOS simulator - if: inputs.platform == 'ios' && inputs.cloud-provider == '' && inputs.goal != '' - shell: bash - env: - LLM_PROVIDER: ${{ inputs.provider }} - LLM_API_KEY: ${{ inputs.api-key }} - LLM_MODEL: ${{ inputs.model }} - LLM_THINKING: 'off' - AGENT_MODE: ${{ inputs.agent-mode }} - MAX_STEPS: ${{ inputs.max-steps }} - STEP_DELAY: ${{ inputs.step-delay }} - PLATFORM: ios - DEVICE_TYPE: simulator - run: appclaw "${{ inputs.goal }}" --platform ios - - # ── Report ──────────────────────────────────────────────────────────────── - - name: Find report path - id: report-path - if: always() - shell: bash - run: | - DIR=$(ls -td .appclaw/runs/*/ 2>/dev/null | head -1 || echo "") - echo "path=${DIR}" >> 
$GITHUB_OUTPUT - if [ -n "$DIR" ]; then - echo "::notice title=AppClaw Report::Report written to ${DIR}" - fi - - - name: Upload report artifact - if: ${{ always() && inputs.report == 'true' }} - uses: actions/upload-artifact@v4 - with: - name: ${{ inputs.report-name }} - path: .appclaw/runs/ - if-no-files-found: warn diff --git a/github-action/examples/android-flow.yml b/github-action/examples/android-flow.yml index 7d7bcd4..59f5812 100644 --- a/github-action/examples/android-flow.yml +++ b/github-action/examples/android-flow.yml @@ -17,7 +17,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/youtube.yaml platform: android diff --git a/github-action/examples/android-goal.yml b/github-action/examples/android-goal.yml index 524ac84..8ced70c 100644 --- a/github-action/examples/android-goal.yml +++ b/github-action/examples/android-goal.yml @@ -17,7 +17,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: goal: 'Open YouTube, search for Appium 3.0, tap the first result, scroll down, verify a video by TestMu AI is visible' platform: android diff --git a/github-action/examples/full-pipeline.yml b/github-action/examples/full-pipeline.yml index 2b5ab4c..c5bdc25 100644 --- a/github-action/examples/full-pipeline.yml +++ b/github-action/examples/full-pipeline.yml @@ -33,7 +33,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 id: smoke with: flow: flows/youtube.yaml @@ -61,7 +61,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: ${{ matrix.flow }} platform: android @@ -77,7 +77,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: 
AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/ios-smoke.yaml platform: ios diff --git a/github-action/examples/ios-flow.yml b/github-action/examples/ios-flow.yml index d594020..0015c39 100644 --- a/github-action/examples/ios-flow.yml +++ b/github-action/examples/ios-flow.yml @@ -17,9 +17,11 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/ios-smoke.yaml platform: ios + ios-simulator-name: 'iPhone 16' # optional: defaults to iPhone 16 + ios-simulator-os: '18.4' # optional: defaults to latest provider: gemini api-key: ${{ secrets.LLM_API_KEY }} diff --git a/github-action/examples/lambdatest-cloud.yml b/github-action/examples/lambdatest-cloud.yml index 7ba949f..d6ffd02 100644 --- a/github-action/examples/lambdatest-cloud.yml +++ b/github-action/examples/lambdatest-cloud.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/youtube.yaml platform: android @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: flows/ios-smoke.yaml platform: ios diff --git a/github-action/examples/matrix-parallel.yml b/github-action/examples/matrix-parallel.yml index 7254aeb..3fc4ad5 100644 --- a/github-action/examples/matrix-parallel.yml +++ b/github-action/examples/matrix-parallel.yml @@ -21,7 +21,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: AppiumTestDistribution/AppClaw/github-action@v1 + - uses: AppiumTestDistribution/AppClaw@v1 with: flow: ${{ matrix.flow }} platform: android diff --git a/landing/usage.html b/landing/usage.html index bc86989..5e9d79e 100644 --- a/landing/usage.html +++ b/landing/usage.html @@ -2964,7 +2964,9 @@

GitHub Actions

Available on the - GitHub Marketplace + GitHub Marketplace as AppClaw Mobile Tests.

@@ -3053,27 +3055,164 @@

Inputs

- flowone of*—Path to a YAML flow file relative to repo root - goalone of*—Natural language goal executed by the LLM agent - platformnoandroidTarget platform: android or ios - providernogeminiLLM provider: gemini, anthropic, openai, groq - api-keyyes—LLM API key — stored as LLM_API_KEY - modelnoprovider defaultLLM model ID to pin (e.g. gemini-2.0-flash) - agent-modenodomdom (element locators) or vision (screenshot AI) - max-stepsno30Maximum agent steps before the run fails - step-delayno500Milliseconds between steps - android-api-levelno33Android emulator API level (33 = Android 13) - android-profilenopixel_6Android AVD hardware profile - android-targetnodefaultEmulator target: default or google_apis - cloud-providernolocalCloud provider: lambdatest. Leave empty for local. - lambdatest-usernameno**—LambdaTest account username - lambdatest-access-keyno**—LambdaTest access key - lambdatest-device-nameno**—Cloud device name (e.g. Pixel 7) - lambdatest-os-versionno**—Cloud OS version (e.g. 13, 16) - lambdatest-appno—LambdaTest app ID (lt://APP...) - reportnotrueUpload HTML report as workflow artifact - report-namenoappclaw-reportName of the uploaded artifact - appclaw-versionnolatestnpm package version to pin + + flow + one of* + — + Path to a YAML flow file relative to repo root + + + goal + one of* + — + Natural language goal executed by the LLM agent + + + platform + no + android + Target platform: android or ios + + + provider + no + gemini + + LLM provider: gemini, anthropic, openai, + groq + + + + api-key + yes + — + LLM API key — stored as LLM_API_KEY + + + model + no + provider default + LLM model ID to pin (e.g. 
gemini-2.0-flash) + + + agent-mode + no + dom + dom (element locators) or vision (screenshot AI) + + + max-steps + no + 30 + Maximum agent steps before the run fails + + + step-delay + no + 500 + Milliseconds between steps + + + android-api-level + no + 33 + Android emulator API level (33 = Android 13) + + + android-profile + no + pixel_6 + Android AVD hardware profile + + + android-target + no + default + Emulator target: default or google_apis + + + ios-device-type + no + simulator + iOS device type: simulator or real + + + ios-simulator-name + no + iPhone 16 + + iOS simulator model to boot (e.g. iPhone 15, iPad Air) + + + + ios-simulator-os + no + latest + iOS version filter for simulator selection (e.g. 18.4) + + + mcp-debug + no + false + + Enable MCP debug logging (MCP_DEBUG=1). Useful for diagnosing CI + timeouts. + + + + cloud-provider + no + local + Cloud provider: lambdatest. Leave empty for local. + + + lambdatest-username + no** + — + LambdaTest account username + + + lambdatest-access-key + no** + — + LambdaTest access key + + + lambdatest-device-name + no** + — + Cloud device name (e.g. Pixel 7) + + + lambdatest-os-version + no** + — + Cloud OS version (e.g. 13, 16) + + + lambdatest-app + no + — + LambdaTest app ID (lt://APP...) + + + report + no + true + Upload HTML report as workflow artifact + + + report-name + no + appclaw-report + Name of the uploaded artifact + + + appclaw-version + no + latest + npm package version to pin +

* Provide either flow or goal, not both.

@@ -3083,17 +3222,36 @@

Inputs

Secrets Setup

- Go to your repo → Settings → Secrets and variables → Actions → New repository secret: + Go to your repo → + Settings → Secrets and variables → Actions → New repository + secret:

- + + + + - - - - + + + + + + + + + + + + + + + +
Secret nameDescription
Secret nameDescription
LLM_API_KEYYour API key — works for any provider (Gemini, Anthropic, OpenAI, Groq)
LT_USERNAMELambdaTest username (only if using cloud devices)
LT_ACCESS_KEYLambdaTest access key (only if using cloud devices)
LT_APP_IDLambdaTest app ID (only if using cloud devices)
LLM_API_KEYYour API key — works for any provider (Gemini, Anthropic, OpenAI, Groq)
LT_USERNAMELambdaTest username (only if using cloud devices)
LT_ACCESS_KEYLambdaTest access key (only if using cloud devices)
LT_APP_IDLambdaTest app ID (only if using cloud devices)
@@ -3209,8 +3367,9 @@

Nightly regression on a schedule

Reports

- When report: true (default), an HTML report is uploaded as a workflow artifact after each run. - Download it from the Actions run summary → Artifacts. The report includes: + When report: true (default), an HTML report is uploaded as a workflow + artifact after each run. Download it from the + Actions run summary → Artifacts. The report includes:

diff --git a/package.json b/package.json index 1b6b2fb..a5e283a 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,7 @@ "deploy:landing": "npm run deploy --prefix landing" }, "dependencies": { + "appium-mcp": "^1.67.0", "@ai-sdk/anthropic": "^1.0.0", "@ai-sdk/google": "^3.0.43", "@ai-sdk/openai": "^1.0.0", diff --git a/src/agent/app-resolver.ts b/src/agent/app-resolver.ts index 363d412..df0b227 100644 --- a/src/agent/app-resolver.ts +++ b/src/agent/app-resolver.ts @@ -126,7 +126,7 @@ export class AppResolver { } try { - const result = await mcp.callTool('appium_list_apps', {}); + const result = await mcp.callTool('appium_app_lifecycle', { action: 'list' }); const text = result.content?.map((c: any) => c.text ?? '').join('\n') ?? ''; this.apps = parseAppList(text); diff --git a/src/agent/element-finder.ts b/src/agent/element-finder.ts index e0a8479..96d41a2 100644 --- a/src/agent/element-finder.ts +++ b/src/agent/element-finder.ts @@ -155,26 +155,24 @@ export async function findElementWithFallback( * Works without finding an element — taps at the exact x,y position. */ export async function tapAtCoordinates(mcp: MCPClient, x: number, y: number): Promise { - // Preferred: appium-mcp's built-in tap by coordinates tool + const ix = Math.round(x); + const iy = Math.round(y); + const mcpDebug = process.env.MCP_DEBUG === '1' || process.env.MCP_DEBUG === 'true'; + + // Preferred: appium_gesture tap at coordinates (appium-mcp 1.61+) try { - const result = await mcp.callTool('appium_tap_by_coordinates', { x, y }); + const result = await mcp.callTool('appium_gesture', { action: 'tap', x: ix, y: iy }); const text = result.content?.map((c: any) => (c.type === 'text' ? c.text : '')).join('') ?? 
''; + if (mcpDebug) + console.log(` tapAtCoordinates(${ix},${iy}) gesture response: ${text.slice(0, 200)}`); if (!text.toLowerCase().includes('error') && !text.toLowerCase().includes('failed')) { return true; } - } catch { - /* not supported or failed */ - } - - // Android: mobile: clickGesture - try { - await mcp.callTool('appium_execute_script', { - script: 'mobile: clickGesture', - args: [{ x, y }], - }); - return true; - } catch { - /* not supported or failed */ + } catch (err) { + if (mcpDebug) + console.log( + ` tapAtCoordinates gesture error: ${err instanceof Error ? err.message : err}` + ); } // W3C Actions pointer tap @@ -186,7 +184,7 @@ export async function tapAtCoordinates(mcp: MCPClient, x: number, y: number): Pr id: 'finger1', parameters: { pointerType: 'touch' }, actions: [ - { type: 'pointerMove', duration: 0, x, y }, + { type: 'pointerMove', duration: 0, x: ix, y: iy }, { type: 'pointerDown', button: 0 }, { type: 'pause', duration: 100 }, { type: 'pointerUp', button: 0 }, @@ -195,8 +193,11 @@ export async function tapAtCoordinates(mcp: MCPClient, x: number, y: number): Pr ], }); return true; - } catch { - /* not supported or failed */ + } catch (err) { + if (mcpDebug) + console.log( + ` tapAtCoordinates w3c error: ${err instanceof Error ? 
err.message : err}` + ); } return false; diff --git a/src/agent/loop.ts b/src/agent/loop.ts index ac19880..dd0e7da 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -24,7 +24,7 @@ import { tapAtCoordinates, isAIElement, parseAIElementCoords } from './element-f import { findElementByVision } from '../mcp/tools.js'; import { Config } from '../config.js'; import { isVisionLocateEnabled } from '../vision/locate-enabled.js'; -import { getCachedScreenSize } from '../vision/window-size.js'; +import { getCachedScreenSize, getScreenSizeForStark } from '../vision/window-size.js'; import type { ActionRecorder } from '../recording/recorder.js'; import type { AppResolver } from './app-resolver.js'; import { preprocessAction, resolveAppId } from './preprocessor.js'; @@ -39,6 +39,7 @@ import { extractGoalKeywords, extractAppIdFromText, } from '../memory/fingerprint.js'; +import { loadAppGuide } from '../appguides/index.js'; const mcpDebug = process.env.MCP_DEBUG === '1' || process.env.MCP_DEBUG === 'true'; @@ -133,6 +134,8 @@ export async function runAgent(options: AgentOptions): Promise { let lastResult = ''; let detectedPlatform: 'android' | 'ios' = 'android'; let postActionScreenshot: string | undefined; // Screenshot captured after previous action + let lastAppGuideId = ''; // Track last app a guide was logged for (avoid duplicate logs) + let activeAppId = options.appId ?? 
''; // Current foreground app — drives AppGuide loading let cachedPostScreen: import('../perception/types.js').ScreenState | undefined; // Reuse post-action screen as next step's perception const triedSelectors: string[] = []; // Track selectors the LLM has tried (for stuck recovery) @@ -146,8 +149,6 @@ export async function runAgent(options: AgentOptions): Promise { // Detect device UDID for keyboard input (ADB-based typing on Android) const deviceUdid = await detectDeviceUdid(); - const agentSpinDetail = ui.formatAgentThinkingDetail(modelName); - // ── Episodic Memory ────────────────────────────────── // Cross-session trajectory store: remembers winning actions from previous runs. const episodicEnabled = Config.EPISODIC_MEMORY === 'on'; @@ -162,7 +163,7 @@ export async function runAgent(options: AgentOptions): Promise { const episodicStore = episodicEnabled ? loadStore(episodicStorePath) : undefined; const goalKeywords = episodicEnabled ? extractGoalKeywords(goal) : []; - if (episodicEnabled) { + if (episodicEnabled && mcpDebug) { const entryCount = episodicStore?.entries.length ?? 0; ui.printAgentBullet(`Episodic memory: ON (${entryCount} stored trajectories)`); } @@ -176,6 +177,10 @@ export async function runAgent(options: AgentOptions): Promise { if (preResult.handled) { ui.printPreprocessor(preResult.message ?? ''); lastResult = preResult.message ?? 
''; + // Track launched app for AppGuide (independent of episodic memory) + if (preResult.appId) { + activeAppId = preResult.appId; + } // Feed preprocessor result to episodic recorder for app ID detection if (episodicRecorder && lastResult) { const appIdFromResult = extractAppIdFromText(lastResult); @@ -189,11 +194,12 @@ export async function runAgent(options: AgentOptions): Promise { } for (let step = 0; step < maxSteps; step++) { - if (step === 0) { + if (step === 0 && mcpDebug) { ui.printAgentBullet('Pulling UI state from the device'); ui.printAgentBullet('Consulting the agent model for the next action'); } - ui.startSpinner('Reasoning…', agentSpinDetail); + const agentSpinDetail = ui.formatAgentThinkingDetail(modelName, step + 1, maxSteps); + ui.startSpinner('Reasoning…', agentSpinDetail, true); // ─── 1. PERCEIVE ───────────────────────────────────── const captureScreenshot = @@ -296,12 +302,12 @@ export async function runAgent(options: AgentOptions): Promise { ui.printWarning( `Rejected adaptation: "${adapted.slice(0, 80)}" — keeping original goal` ); - ui.startSpinner('Reasoning…', agentSpinDetail); + ui.startSpinner('Reasoning…', agentSpinDetail, true); } else { ui.stopSpinner(); ui.printInfo(`Goal adapted: ${adapted}`); goal = adapted; - ui.startSpinner('Reasoning…', agentSpinDetail); + ui.startSpinner('Reasoning…', agentSpinDetail, true); } } } @@ -347,7 +353,7 @@ export async function runAgent(options: AgentOptions): Promise { stuckHint += `\n\n${rollbackResult.message}`; stuck.reset(); } - ui.startSpinner('Reasoning…', agentSpinDetail); + ui.startSpinner('Reasoning…', agentSpinDetail, true); } // ─── 4. 
REASON (LLM call) ──────────────────────────── @@ -388,9 +394,30 @@ export async function runAgent(options: AgentOptions): Promise { if (matches.length > 0) { pastExperience = formatExperienceForPrompt(matches); episodicRecorder.trackInjectedTrajectories(matches); - ui.printAgentBullet( - `Episodic memory: injecting ${matches.length} past experience(s) (score: ${matches[0].score.toFixed(2)})` - ); + if (mcpDebug) { + ui.printAgentBullet( + `Episodic memory: injecting ${matches.length} past experience(s) (score: ${matches[0].score.toFixed(2)})` + ); + } + } + } + + // ── AppGuide: per-app navigation knowledge ──────────── + // activeAppId is set by the preprocessor or launch_app meta-tool (independent of episodic memory) + // Also sync from episodic recorder if it detected a new app via DOM + if (episodicRecorder?.currentAppId) activeAppId = episodicRecorder.currentAppId; + const appGuide = loadAppGuide(activeAppId); + if (appGuide) { + if (activeAppId !== lastAppGuideId) { + lastAppGuideId = activeAppId; + if (mcpDebug) { + const firstLine = appGuide.split('\n')[0]; + ui.printAgentBullet( + `AppGuide: injecting ${firstLine.replace('APP_GUIDE ', '').replace(':', '').trim()}` + ); + } + } else if (mcpDebug) { + ui.printAgentBullet(`AppGuide: active (${activeAppId})`); } } @@ -408,25 +435,31 @@ export async function runAgent(options: AgentOptions): Promise { editableCount: screen.editableCount, failedOnScreen, pastExperience, + appGuide, }; let decision: ToolCallDecision; let streamingStarted = false; const llmT0 = performance.now(); try { - decision = await llm.getDecision(context, { - onTextStart() { - streamingStarted = true; - ui.stopSpinner(); - ui.startStreaming('Reasoning'); - }, - onTextChunk(text) { - ui.streamChunk(text); - }, - onDone() { - ui.stopStreaming(); - }, - }); + decision = await llm.getDecision( + context, + mcpDebug + ? 
{ + onTextStart() { + streamingStarted = true; + ui.stopSpinner(); + ui.startStreaming('Reasoning'); + }, + onTextChunk(text) { + ui.streamChunk(text); + }, + onDone() { + ui.stopStreaming(); + }, + } + : {} + ); } catch (err: any) { const errName = err?.name ?? ''; const errMsg = err?.message ?? ''; @@ -466,8 +499,8 @@ export async function runAgent(options: AgentOptions): Promise { ); } - // If reasoning text is available but wasn't streamed live, show it now - if (decision.reasoning && !streamingStarted) { + // If reasoning text is available but wasn't streamed live, show it now (debug only) + if (mcpDebug && decision.reasoning && !streamingStarted) { ui.printReasoning(decision.reasoning); } @@ -625,7 +658,8 @@ export async function runAgent(options: AgentOptions): Promise { appResolver, deviceUdid, detectedPlatform, - screenshotForLLM + screenshotForLLM, + episodicRecorder ); } else { // Forward directly to MCP — appium tools, skills, everything @@ -634,6 +668,12 @@ export async function runAgent(options: AgentOptions): Promise { lastResult = `${decision.toolName} → ${result.success ? 'OK' : 'FAILED'}: ${result.message}`; + // ── Track launched app for AppGuide ────────────────── + if (decision.toolName === 'launch_app' && result.success) { + const launchedId = (decision.args.appId as string) ?? ''; + if (launchedId) activeAppId = launchedId; + } + // ── Record failure in negative cache ────────────────── // Only track failures with a selector — these are the ones the LLM // would otherwise retry. 
Keyed by screen hash so failures are @@ -800,6 +840,7 @@ export async function runAgent(options: AgentOptions): Promise { const META_TOOLS = new Set([ 'find_and_click', 'find_and_type', + 'find_and_long_press', 'launch_app', 'go_back', 'go_home', @@ -819,7 +860,8 @@ async function executeMetaTool( deviceUdid?: string | null, platform: 'android' | 'ios' = 'android', /** Reusable screenshot from the current step (avoids redundant capture in vision locate) */ - currentScreenshot?: string + currentScreenshot?: string, + episodicRecorder?: EpisodicRecorder ): Promise { /** * Scale LLM-provided 0-1000 normalized coordinates to device space. @@ -829,18 +871,17 @@ async function executeMetaTool( * Note: df-vision convention is [y, x] order for coordinates. */ async function scaleLLMCoords(tapX: number, tapY: number): Promise<{ x: number; y: number }> { - const deviceSize = getCachedScreenSize(mcp); - if (!deviceSize) { - // Fallback: no device size, can't scale — return as-is (will likely miss) - return { x: Math.round(tapX), y: Math.round(tapY) }; - } try { + // getScreenSizeForStark fetches from Appium if cache is empty — never silently skips scaling + const deviceSize = await getScreenSizeForStark(mcp, currentScreenshot ?? 
''); const starkVision = (await import('df-vision')).default; // scaleCoordinates expects [y, x] in 0-1000 normalized space const bbox = starkVision.scaleCoordinates([tapY, tapX] as [number, number], deviceSize); return { x: Math.round(bbox.center.x), y: Math.round(bbox.center.y) }; } catch { - // df-vision unavailable — simple fallback + // df-vision unavailable — simple proportional fallback using cached size + const deviceSize = getCachedScreenSize(mcp); + if (!deviceSize) return { x: Math.round(tapX), y: Math.round(tapY) }; return { x: Math.round((tapX / 1000) * deviceSize.width), y: Math.round((tapY / 1000) * deviceSize.height), @@ -897,7 +938,10 @@ async function executeMetaTool( const visionUuid = await findElementByVision(mcp, selector, currentScreenshot); // Pass the UUID (ai-element: or standard) directly to appium_click // appium-mcp handles ai-element: UUIDs natively with coordinate tapping - const clickResult = await mcp.callTool('appium_click', { elementUUID: visionUuid }); + const clickResult = await mcp.callTool('appium_gesture', { + action: 'tap', + elementUUID: visionUuid, + }); if (!isMCPError(clickResult)) { const coords = parseAIElementCoords(visionUuid); const coordInfo = coords ? 
` at [${coords.x},${coords.y}]` : ''; @@ -942,7 +986,10 @@ async function executeMetaTool( // Strategy 1: Use the LLM's chosen strategy try { const uuid = await findElement(mcp, strategy as any, selector); - const clickResult = await mcp.callTool('appium_click', { elementUUID: uuid }); + const clickResult = await mcp.callTool('appium_gesture', { + action: 'tap', + elementUUID: uuid, + }); if (!isMCPError(clickResult)) { return { success: true, message: `Clicked "${selector.slice(0, 60)}" via ${strategy}` }; } @@ -963,7 +1010,10 @@ async function executeMetaTool( for (const fb of fallbackStrategies) { try { const uuid = await findElement(mcp, fb.s as any, fb.v); - const clickResult = await mcp.callTool('appium_click', { elementUUID: uuid }); + const clickResult = await mcp.callTool('appium_gesture', { + action: 'tap', + elementUUID: uuid, + }); if (!isMCPError(clickResult)) { return { success: true, @@ -980,7 +1030,10 @@ async function executeMetaTool( if (isVisionLocateEnabled()) { try { const visionUuid = await findElementByVision(mcp, selector, currentScreenshot); - const clickResult = await mcp.callTool('appium_click', { elementUUID: visionUuid }); + const clickResult = await mcp.callTool('appium_gesture', { + action: 'tap', + elementUUID: visionUuid, + }); if (!isMCPError(clickResult)) { const coords = parseAIElementCoords(visionUuid); const coordInfo = coords ? ` at [${coords.x},${coords.y}]` : ''; @@ -1018,6 +1071,191 @@ async function executeMetaTool( }; } + case 'find_and_long_press': { + const isVisionModeLongPress = Config.AGENT_MODE === 'vision'; + const lpSelector = args.selector as string; + const lpBounds = args.bounds as string | undefined; + const lpTapX = args.tapX as number | undefined; + const lpTapY = args.tapY as number | undefined; + const lpDuration = (args.duration as number | undefined) ?? 2000; + const lpAttempts: string[] = []; + + /** + * Long-press at absolute device coordinates via appium_gesture (appium-mcp 1.61+). 
+ * appium_gesture action=long_press accepts x/y directly without needing an element UUID. + */ + async function longPressAtCoords( + x: number, + y: number + ): Promise<{ success: boolean; text: string }> { + const result = await mcp.callTool('appium_gesture', { + action: 'long_press', + x, + y, + duration: lpDuration, + }); + const text = + result.content + ?.map((c: any) => (c.type === 'text' ? c.text : '')) + .filter(Boolean) + .join(' ') ?? ''; + return { success: !isMCPError(result), text }; + } + + if (isVisionModeLongPress) { + // ══ VISION MODE: locate via AI vision, then long-press at coordinates ══ + + // Fast path: LLM provided 0-1000 normalized coordinates + if (lpTapX != null && lpTapY != null) { + const scaled = await scaleLLMCoords(lpTapX, lpTapY); + const { success } = await longPressAtCoords(scaled.x, scaled.y); + if (success) { + return { + success: true, + message: `Long-pressed "${lpSelector.slice(0, 60)}" via LLM coordinates at [${scaled.x},${scaled.y}]`, + }; + } + lpAttempts.push(`llm_coords [${scaled.x},${scaled.y}]: long-press failed`); + } + + // Vision locate fallback + if (isVisionLocateEnabled()) { + try { + const visionUuid = await findElementByVision(mcp, lpSelector, currentScreenshot); + const coords = parseAIElementCoords(visionUuid); + if (coords) { + const { success } = await longPressAtCoords(coords.x, coords.y); + if (success) { + return { + success: true, + message: `Long-pressed "${lpSelector.slice(0, 60)}" via AI vision at [${coords.x},${coords.y}]`, + }; + } + lpAttempts.push(`ai_vision: long-press failed at [${coords.x},${coords.y}]`); + } else { + lpAttempts.push('ai_vision: could not parse coordinates from UUID'); + } + } catch (err) { + lpAttempts.push( + `ai_vision: ${err instanceof Error ? 
err.message.slice(0, 60) : 'not found'}` + ); + } + } + + // Bounds coordinate fallback + if (lpBounds) { + const coordMatch = lpBounds.match(/\[(\d+),(\d+)\]\[(\d+),(\d+)\]/); + if (coordMatch) { + const cx = Math.round((parseInt(coordMatch[1]) + parseInt(coordMatch[3])) / 2); + const cy = Math.round((parseInt(coordMatch[2]) + parseInt(coordMatch[4])) / 2); + const { success } = await longPressAtCoords(cx, cy); + if (success) { + return { + success: true, + message: `Long-pressed "${lpSelector.slice(0, 60)}" at coordinates [${cx},${cy}]`, + }; + } + lpAttempts.push('coordinates: long-press failed'); + } + } + + return { + success: false, + message: `Long-press failed for "${lpSelector.slice(0, 60)}": ${lpAttempts.join(', ')}`, + }; + } + + // ══ DOM MODE: find element UUID, then long-press ══ + const lpStrategy = args.strategy as string; + const lpDomAttempts: string[] = []; + + // Try the LLM's chosen strategy + try { + const uuid = await findElement(mcp, lpStrategy as any, lpSelector); + const lpResult = await mcp.callTool('appium_gesture', { + action: 'long_press', + elementUUID: uuid, + duration: lpDuration, + }); + if (!isMCPError(lpResult)) { + return { + success: true, + message: `Long-pressed "${lpSelector.slice(0, 60)}" via ${lpStrategy}`, + }; + } + lpDomAttempts.push(`${lpStrategy}: long-press failed`); + } catch { + lpDomAttempts.push(`${lpStrategy}: not found`); + } + + // Try alternate strategies + const lpFallbackStrategies: Array<{ s: string; v: string }> = []; + if (lpStrategy !== 'accessibility id') + lpFallbackStrategies.push({ s: 'accessibility id', v: lpSelector }); + if (lpStrategy !== 'id') lpFallbackStrategies.push({ s: 'id', v: lpSelector }); + + for (const fb of lpFallbackStrategies) { + try { + const uuid = await findElement(mcp, fb.s as any, fb.v); + // Use the consolidated appium_gesture tool, same as the primary strategy above + const lpResult = await mcp.callTool('appium_gesture', { + action: 'long_press', + elementUUID: uuid, + duration: lpDuration, + }); + if (!isMCPError(lpResult)) { + return { + success: true, + message: 
`Long-pressed "${lpSelector.slice(0, 60)}" via fallback ${fb.s}`, + }; + } + lpDomAttempts.push(`${fb.s}: long-press failed`); + } catch { + lpDomAttempts.push(`${fb.s}: not found`); + } + } + + // Vision fallback — extract coords and use coordinate-based long press + if (isVisionLocateEnabled()) { + try { + const visionUuid = await findElementByVision(mcp, lpSelector, currentScreenshot); + const coords = parseAIElementCoords(visionUuid); + if (coords) { + const { success } = await longPressAtCoords(coords.x, coords.y); + if (success) { + return { + success: true, + message: `Long-pressed "${lpSelector.slice(0, 60)}" via AI vision at [${coords.x},${coords.y}]`, + }; + } + lpDomAttempts.push('ai_vision: long-press failed'); + } else { + lpDomAttempts.push('ai_vision: could not parse coordinates from UUID'); + } + } catch { + lpDomAttempts.push('ai_vision: not found'); + } + } + + // Bounds coordinate fallback + if (lpBounds) { + const coordMatch = lpBounds.match(/\[(\d+),(\d+)\]\[(\d+),(\d+)\]/); + if (coordMatch) { + const cx = Math.round((parseInt(coordMatch[1]) + parseInt(coordMatch[3])) / 2); + const cy = Math.round((parseInt(coordMatch[2]) + parseInt(coordMatch[4])) / 2); + const { success } = await longPressAtCoords(cx, cy); + if (success) { + return { + success: true, + message: `Long-pressed "${lpSelector.slice(0, 60)}" at coordinates [${cx},${cy}]`, + }; + } + // DOM-mode failures go to lpDomAttempts — that is the list joined into the final message + lpDomAttempts.push('coordinates: long-press failed'); + } + } + + return { + success: false, + message: `All strategies failed for long-press "${lpSelector.slice(0, 60)}": ${lpDomAttempts.join(', ')}`, + }; + } + case 'find_and_type': { const isVisionModeType = Config.AGENT_MODE === 'vision'; // In vision mode, force ai_instruction regardless of what the LLM chose @@ -1051,7 +1289,10 @@ async function executeMetaTool( try { const visionUuid = await findElementByVision(mcp, selector, currentScreenshot); // Use appium_click which natively handles ai-element: UUIDs - const clickResult = await 
mcp.callTool('appium_gesture', { + action: 'tap', + elementUUID: visionUuid, + }); if (!isMCPError(clickResult)) { tappedViaVision = true; } @@ -1079,7 +1320,10 @@ async function executeMetaTool( if (!uuid && isVisionLocateEnabled()) { try { const visionUuid = await findElementByVision(mcp, selector, currentScreenshot); - const clickResult = await mcp.callTool('appium_click', { elementUUID: visionUuid }); + const clickResult = await mcp.callTool('appium_gesture', { + action: 'tap', + elementUUID: visionUuid, + }); if (!isMCPError(clickResult)) { tappedViaVision = true; } @@ -1099,7 +1343,7 @@ async function executeMetaTool( } } else if (uuid) { // Click the found element to focus/navigate - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); } else if (!tappedViaVision) { return { success: false, @@ -1155,6 +1399,9 @@ async function executeMetaTool( } ui.printStepDetail(`activateApp("${appId}")`); const launched = await activateAppWithFallback(mcp, appId); + if (launched.success && episodicRecorder) { + episodicRecorder.setAppId(appId); + } return { success: launched.success, message: launched.success ? 
`Launched ${appId}` : launched.message, @@ -1177,17 +1424,14 @@ async function executeMetaTool( return { success: true, message: 'Pressed Enter' }; } } - // Strategy 2: Appium execute script fallback + // Strategy 2: Appium press key fallback try { - await mcp.callTool('appium_execute_script', { - script: 'mobile: shell', - args: [{ command: 'input', args: ['keyevent', '66'] }], - }); + await mcp.callTool('appium_mobile_press_key', { key: 'ENTER' }); return { success: true, message: 'Pressed Enter' }; } catch { return { success: false, - message: 'Failed to press Enter — both ADB and Appium script failed', + message: 'Failed to press Enter — both ADB and Appium press_key failed', }; } } @@ -1241,7 +1485,9 @@ function formatArgs(decision: ToolCallDecision): string { const visionUi = Config.AGENT_MODE === 'vision' && - (decision.toolName === 'find_and_click' || decision.toolName === 'find_and_type'); + (decision.toolName === 'find_and_click' || + decision.toolName === 'find_and_type' || + decision.toolName === 'find_and_long_press'); if (visionUi && args.selector) { const s = String(args.selector); const short = s.length > 90 ? `${s.slice(0, 90)}…` : s; diff --git a/src/agent/planner.ts b/src/agent/planner.ts index 9eabc31..3c4d697 100644 --- a/src/agent/planner.ts +++ b/src/agent/planner.ts @@ -51,12 +51,17 @@ export interface PlannerResult { export async function decomposeGoal( goal: string, model: any, - providerOptions?: Record + providerOptions?: Record, + appGuide?: string ): Promise { + const system = appGuide + ? `${PLANNER_SYSTEM_PROMPT}\n\n--- APP-SPECIFIC KNOWLEDGE ---\nThe following guide describes the target app's UI layout and common actions. 
Use it to create better, more specific sub-goals that leverage known UI patterns (e.g., prefer the correct gesture or button name from the guide).\n\n${appGuide}` + : PLANNER_SYSTEM_PROMPT; + const { object } = await generateObject({ model, schema: planSchema, - system: PLANNER_SYSTEM_PROMPT, + system, messages: [{ role: 'user', content: goal }], ...(providerOptions ? { providerOptions } : {}), }); @@ -241,16 +246,21 @@ export async function evaluateSubGoal( completedGoals: string[], currentScreenDOM: string, providerOptions?: Record, - screenshot?: string + screenshot?: string, + appGuide?: string ): Promise { // Build message content — include screenshot if available for visual verification + const appGuideSection = appGuide + ? `\nAPP-SPECIFIC KNOWLEDGE:\n${appGuide}\nUse this knowledge when deciding whether to skip, rewrite, or proceed. If the guide describes how to achieve the sub-goal more directly than planned, REWRITE to leverage the known UI patterns.\n` + : ''; + const textContent = `OVERALL GOAL: ${overallGoal} CURRENT SUB-GOAL TO EVALUATE: ${subGoal} COMPLETED SUB-GOALS: ${completedGoals.length > 0 ? completedGoals.map((g, i) => `${i + 1}. ${g}`).join('\n') : '(none)'} - +${appGuideSection} CURRENT SCREEN STATE (DOM): ${currentScreenDOM} @@ -308,11 +318,16 @@ export async function assessScreenReadiness( nextGoal: string, currentScreenDOM: string, providerOptions?: Record, - screenshot?: string + screenshot?: string, + appGuide?: string ): Promise { + const appGuideSection = appGuide + ? 
`\nAPP-SPECIFIC KNOWLEDGE:\n${appGuide}\nUse this knowledge to understand the app's UI and suggest precise cleanup actions (e.g., specific button names or gestures from the guide).\n` + : ''; + const textContent = `JUST COMPLETED: ${completedGoal} NEXT SUB-GOAL: ${nextGoal} - +${appGuideSection} CURRENT SCREEN STATE (DOM): ${currentScreenDOM} diff --git a/src/agent/preprocessor.ts b/src/agent/preprocessor.ts index 90bbc49..abacf15 100644 --- a/src/agent/preprocessor.ts +++ b/src/agent/preprocessor.ts @@ -15,6 +15,8 @@ export interface PreprocessResult { handled: boolean; action?: string; message?: string; + /** Resolved package / bundle ID when action is 'launch' */ + appId?: string; } /** @@ -43,7 +45,12 @@ export async function preprocessAction( ui.printStepDetail(`activateApp("${packageId}") for "${appName}"`); const r = await activateAppWithFallback(mcp, packageId); if (r.success) { - return { handled: true, action: 'launch', message: `Launched ${appName} (${packageId})` }; + return { + handled: true, + action: 'launch', + message: `Launched ${appName} (${packageId})`, + appId: packageId, + }; } return { handled: false }; } @@ -59,7 +66,7 @@ export async function preprocessAction( // Check if it's a URL if (/^https?:\/\//i.test(appName)) { const browserPkg = appResolver.resolve('chrome') ?? 
'com.android.chrome'; - await mcp.callTool('appium_activate_app', { id: browserPkg }); + await mcp.callTool('appium_app_lifecycle', { action: 'activate', id: browserPkg }); return { handled: true, action: 'open_url', message: `Opened browser for ${appName}` }; } @@ -69,7 +76,12 @@ export async function preprocessAction( ui.printStepDetail(`activateApp("${packageId}") for "${appName}"`); const r = await activateAppWithFallback(mcp, packageId); if (r.success) { - return { handled: true, action: 'launch', message: `Launched ${appName} (${packageId})` }; + return { + handled: true, + action: 'launch', + message: `Launched ${appName} (${packageId})`, + appId: packageId, + }; } return { handled: false }; } diff --git a/src/appguides/index.ts b/src/appguides/index.ts new file mode 100644 index 0000000..6a22498 --- /dev/null +++ b/src/appguides/index.ts @@ -0,0 +1,214 @@ +/** + * AppGuide — per-app knowledge injected into the agent's context. + * + * Built-in guides live in this file (keyed by package name / bundle ID). + * Custom guides live in .appclaw/guides/.md — they take priority over built-ins, + * so users can override or extend any guide without touching source code. 
+ */ + +import { readFileSync, existsSync } from 'fs'; +import { join } from 'path'; + +interface AppGuide { + name: string; + content: string; +} + +const GUIDES: Record = { + // ── Gmail ───────────────────────────────────────────────────────────── + 'com.google.android.gm': { + name: 'Gmail', + content: `## Gmail Navigation +- Hamburger menu (top-left) → folders (Inbox, Sent, Drafts, Trash, All Mail) +- Compose button: floating pencil/+ button at bottom-right +- Swipe right on an email → Archive; swipe left → Delete + +## Searching +- Tap the search bar at the top; supports filters: + from:sender@example.com | to:user@example.com | subject:keyword | has:attachment | is:unread + +## Common Actions +- Archive: swipe right on the email row +- Delete: swipe left on the email row +- Select multiple: long-press an email to enter selection mode +- Star: tap the star icon next to the email +- Mark read/unread: long-press → select → tap the envelope icon + +## Composing +- Tap the floating compose button (bottom-right pencil icon) +- Fill To / Subject / Body; attach via paperclip icon; send via paper-plane icon (top-right) + +## Tips +- Primary / Social / Promotions tabs separate email categories +- Labels and filters are in Settings → account → Filters and Blocked Addresses`, + }, + + 'com.google.gmail': { + name: 'Gmail (iOS)', + content: `## Gmail Navigation (iOS) +- Tap the three-line menu (top-left) for folders +- Compose: red pencil button bottom-right +- Swipe left on an email for Archive / Trash options + +## Searching +- Search bar at top; same filters: from: to: subject: has:attachment is:unread + +## Composing +- Tap the pencil button (bottom-right) +- Add recipients, subject, body; attach via paperclip; send via paper-plane icon`, + }, + + // ── YouTube ─────────────────────────────────────────────────────────── + 'com.google.android.youtube': { + name: 'YouTube', + content: `## YouTube Navigation +- Bottom nav: Home | Shorts | + (upload) | Subscriptions | 
Library +- Search: magnifying-glass icon (top-right) +- Tap a video thumbnail to play; double-tap left/right to seek ±10 s + +## Searching +- Tap the search icon → type query → press Enter or tap the search icon again +- Filter results: tap "Filters" after searching + +## Common Actions +- Like: thumbs-up under the video +- Subscribe: red Subscribe button under/beside the channel name +- Save to playlist: tap ⋮ menu on a video → Save to playlist +- Share: tap the Share button under the video + +## Playback +- Full screen: rotate device or tap the expand icon (bottom-right of player) +- Quality: tap ⋮ inside player → Quality +- Captions: tap CC icon inside player`, + }, + + 'com.google.ios.youtube': { + name: 'YouTube (iOS)', + content: `## YouTube Navigation (iOS) +- Bottom nav: Home | Shorts | + | Subscriptions | Library +- Search: tap the search icon (top-right) +- Tap a thumbnail to play; double-tap sides to seek + +## Common Actions +- Like: thumbs-up below video +- Subscribe: Subscribe button next to channel name +- Save: tap ⋮ on a video → Save to playlist`, + }, + + // ── WhatsApp ────────────────────────────────────────────────────────── + 'com.whatsapp': { + name: 'WhatsApp', + content: `## WhatsApp Navigation +- Bottom tabs: Chats | Updates | Communities | Calls +- New chat: floating pencil/message icon (bottom-right) +- Search: magnifying-glass icon at the top of Chats + +## Messaging +- Open a chat → type in the message bar at the bottom → send via arrow icon +- Attach media: paperclip icon next to message bar +- Voice note: long-press the microphone icon +- Emoji/stickers: smiley face icon on the left of message bar + +## Common Actions +- Star a message: long-press message → star icon +- Forward: long-press message → forward arrow +- Delete: long-press message → trash icon +- Group info: tap the group name at the top of the chat`, + }, + + 'net.whatsapp.WhatsApp': { + name: 'WhatsApp (iOS)', + content: `## WhatsApp Navigation (iOS) +- Bottom tabs: 
Chats | Updates | Communities | Calls +- New chat: pencil icon (top-right) +- Search: pull down on Chats list + +## Messaging +- Open chat → message bar → send with arrow +- Attach: + icon to the left of the message bar`, + }, + + // ── Chrome ──────────────────────────────────────────────────────────── + 'com.android.chrome': { + name: 'Chrome', + content: `## Chrome Navigation +- Address bar at the top: tap to type a URL or search query, then press Enter +- Back/forward: use device back button or long-press back for history +- Tabs: square icon (top-right) shows open tabs; tap + to open a new tab +- Menu: three-dot icon (top-right) for bookmarks, history, settings, etc. + +## Common Actions +- Bookmark: three-dot menu → Bookmark (star) or tap the star in the address bar +- Share: three-dot menu → Share +- Find in page: three-dot menu → Find in page +- Refresh: circular arrow in the address bar (or pull down on the page) +- Incognito tab: three-dot menu → New Incognito Tab`, + }, + + 'com.google.chrome': { + name: 'Chrome (iOS)', + content: `## Chrome Navigation (iOS) +- Address bar at top: tap → type URL or search → Go +- Tabs: tab count button (bottom-right) +- Three-dot menu (bottom-right) for bookmarks, history, settings`, + }, + + // ── Settings ────────────────────────────────────────────────────────── + 'com.android.settings': { + name: 'Android Settings', + content: `## Settings Navigation +- Use the search bar at the top to find any setting by keyword +- Main sections: Network & internet | Connected devices | Apps | Battery | Display | Sound | Storage | Security | Privacy | Location | Accounts | Accessibility | System + +## Common Paths +- Wi-Fi: Network & internet → Internet +- Bluetooth: Connected devices → Connection preferences → Bluetooth +- Notification settings: Notifications (top-level or via Apps → app name) +- App permissions: Apps → (app name) → Permissions +- Developer options: System → Developer options (enable via Build number tap ×7)`, + }, 
+ + 'com.apple.Preferences': { + name: 'iOS Settings', + content: `## iOS Settings Navigation +- Search bar at the top of the settings list — fastest way to find any setting +- Main sections: Wi-Fi | Bluetooth | Cellular | Notifications | Sounds | Focus | Screen Time | General | Display | Accessibility | Privacy & Security | App Store | Wallet | Passwords | (installed apps at the bottom) + +## Common Paths +- Wi-Fi: Settings → Wi-Fi → toggle or select network +- Bluetooth: Settings → Bluetooth +- App notifications: Settings → Notifications → (app name) +- Location services: Settings → Privacy & Security → Location Services +- Battery: Settings → Battery`, + }, +}; + +/** + * Returns the AppGuide content for the given app ID, or undefined if none found. + * + * Resolution order: + * 1. .appclaw/guides/.md (user custom — wins over built-ins) + * 2. Built-in GUIDES map + */ +export function loadAppGuide(appId: string): string | undefined { + if (!appId) return undefined; + + // 1. User custom guide + const customPath = join(process.cwd(), '.appclaw', 'guides', `${appId}.md`); + if (existsSync(customPath)) { + const content = readFileSync(customPath, 'utf-8').trim(); + if (content) return `APP_GUIDE (${appId}):\n${content}`; + } + + // 2. Built-in guide + const guide = GUIDES[appId]; + if (!guide) return undefined; + return `APP_GUIDE (${guide.name}):\n${guide.content}`; +} + +/** Returns true if an AppGuide exists for the given app ID (built-in or custom). 
*/ +export function hasAppGuide(appId: string): boolean { + if (!appId) return false; + const customPath = join(process.cwd(), '.appclaw', 'guides', `${appId}.md`); + return existsSync(customPath) || appId in GUIDES; +} diff --git a/src/config.ts b/src/config.ts index 62308f0..594e43b 100644 --- a/src/config.ts +++ b/src/config.ts @@ -51,6 +51,8 @@ const envSchema = z.object({ STEP_DELAY: z.coerce.number().default(500), MAX_ELEMENTS: z.coerce.number().default(40), MAX_HISTORY_STEPS: z.coerce.number().default(10), + /** Milliseconds before an LLM request is aborted. Default 60 s. Set to 0 to disable. */ + LLM_REQUEST_TIMEOUT_MS: z.coerce.number().default(60_000), VISION_MODE: z.enum(['always', 'fallback', 'never']).default('fallback'), LOG_DIR: z.string().default('logs'), diff --git a/src/device/device-picker.ts b/src/device/device-picker.ts index 620bbae..ce48c99 100644 --- a/src/device/device-picker.ts +++ b/src/device/device-picker.ts @@ -44,7 +44,17 @@ export async function discoverAndSelectDevice( deviceName: string | null, forceDevicePicker: boolean = false ): Promise { - // Step 1: Call select_platform to discover available devices + // Fast path: UDID already known — skip device enumeration entirely. + // select_device accepts deviceUdid directly and will bypass the slow list call. 
+ if (udid && !forceDevicePicker) { + ui.startSpinner(`Selecting ${platform} device...`); + await selectDeviceOnMcp(mcp, platform, deviceType, udid); + ui.stopSpinner(); + ui.printSetupOk(`Selected device ${udid}`); + return { device: { name: udid, udid }, platform, deviceType }; + } + + // Step 1: Call select_device to discover available devices ui.startSpinner(`Discovering ${platform} devices...`); const selectPlatformArgs: Record = { platform }; @@ -52,7 +62,7 @@ export async function discoverAndSelectDevice( selectPlatformArgs.iosDeviceType = deviceType; } - const platformResult = await mcp.callTool('select_platform', selectPlatformArgs); + const platformResult = await mcp.callTool('select_device', selectPlatformArgs); const platformText = extractText(platformResult); ui.stopSpinner(); diff --git a/src/device/index.ts b/src/device/index.ts index 09ccb75..0f4628b 100644 --- a/src/device/index.ts +++ b/src/device/index.ts @@ -27,6 +27,11 @@ export interface DeviceSetupArgs { cliUdid: string | null; cliDeviceName: string | null; config: AppClawConfig; + /** + * Always show the device picker even when a single device is available or the platform + * is pre-selected. Used by playground mode so the user always gets to choose a device. + */ + alwaysPickDevice?: boolean; /** * Extra Appium capabilities merged into the session for this specific device. * Used by parallel runners to assign unique ports per worker: @@ -93,7 +98,9 @@ export async function setupDevice( // so the user can choose which device they want. Only auto-select when explicitly set. 
const explicitDevice = !!(udid || deviceName); const explicitPlatform = !!(args.cliPlatform || args.config.PLATFORM); - const forceDevicePicker = !explicitDevice && !explicitPlatform; + // Force picker when: no device/platform specified interactively, OR caller explicitly requests it + const forceDevicePicker = + (!explicitDevice && !explicitPlatform) || (!!args.alwaysPickDevice && !explicitDevice); const selection = await discoverAndSelectDevice( mcp, diff --git a/src/device/session.ts b/src/device/session.ts index c2a9675..c2fc12b 100644 --- a/src/device/session.ts +++ b/src/device/session.ts @@ -65,7 +65,10 @@ export async function createPlatformSession( } try { - const sessionResult = await mcp.callTool('create_session', args); + const sessionResult = await mcp.callTool('appium_session_management', { + action: 'create', + ...args, + }); const resultText = extractText(sessionResult); if (resultText.toLowerCase().includes('error') || resultText.toLowerCase().includes('failed')) { @@ -151,7 +154,10 @@ async function createLambdaTestSession( }; try { - const sessionResult = await mcp.callTool('create_session', args); + const sessionResult = await mcp.callTool('appium_session_management', { + action: 'create', + ...args, + }); const resultText = extractText(sessionResult); if (resultText.toLowerCase().includes('error') || resultText.toLowerCase().includes('failed')) { @@ -214,7 +220,7 @@ async function detectScreenSize(mcp: MCPClient, platform: Platform): Promise c.text ?? 
'') + .join('') + .trim(); + if (foundUuid) { + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: foundUuid }); + } await sleep(options.stepDelayMs); // Capture new screen @@ -211,7 +218,7 @@ export async function crawlApp( } // Navigate back to the original screen - await mcp.callTool('appium_press_back', {}); + await mcp.callTool('appium_mobile_press_key', { key: 'BACK' }); await sleep(options.stepDelayMs); // Verify we're back on the expected screen @@ -219,7 +226,7 @@ export async function crawlApp( const backId = findMatchingScreen(backState.dom, screens); if (backId !== screenId) { // Not back on the expected screen — try one more back - await mcp.callTool('appium_press_back', {}); + await mcp.callTool('appium_mobile_press_key', { key: 'BACK' }); await sleep(options.stepDelayMs); } } catch { diff --git a/src/flow/llm-parser.ts b/src/flow/llm-parser.ts index 9d438d8..2d1ac82 100644 --- a/src/flow/llm-parser.ts +++ b/src/flow/llm-parser.ts @@ -14,6 +14,11 @@ import type { FlowStep } from './types.js'; const stepSchema = z.discriminatedUnion('kind', [ z.object({ kind: z.literal('openApp'), query: z.string().describe('App name to open') }), z.object({ kind: z.literal('tap'), label: z.string().describe('Element label/text to tap') }), + z.object({ + kind: z.literal('longPress'), + label: z.string().describe('Element label/text to long-press'), + duration: z.number().optional().describe('Hold duration in ms, default 2000'), + }), z.object({ kind: z.literal('type'), text: z.string().describe('Text to type'), @@ -58,6 +63,7 @@ const SYSTEM_PROMPT = `Rules:\n` + `- "open/launch/start " → openApp\n` + `- "click/tap/press/select " → tap\n` + + `- "long press/long-press/press and hold " → longPress\n` + `- "type/enter/input " or "search for " → type\n` + `- "wait for to be visible/appear" → waitUntil (visible)\n` + `- "wait for to disappear/be gone" → waitUntil (gone)\n` + diff --git a/src/flow/natural-line.ts b/src/flow/natural-line.ts index 
1e5f556..6facdda 100644 --- a/src/flow/natural-line.ts +++ b/src/flow/natural-line.ts @@ -39,6 +39,24 @@ export function tryParseNaturalFlowLine(line: string): FlowStep | null { if (label) return { kind: 'tap', label, verbatim }; } + // "long press X" / "long-press X" / "long tap X" / "press and hold X" + const longPressMatch = t.match( + /^(?:long[\s-]press|long[\s-]tap|press\s+and\s+hold)(?:\s+on)?\s+(?:the\s+)?(.+?)(?:\s+for\s+(\d+(?:\.\d+)?)\s*(?:ms|milliseconds?|s|seconds?))?$/i + ); + if (longPressMatch) { + const label = trimPunct(longPressMatch[1].trim()); + const durRaw = longPressMatch[2]; + const durUnit = + longPressMatch[0].match(/(\d+(?:\.\d+)?)\s*(ms|milliseconds?|s|seconds?)$/i)?.[2] ?? 'ms'; + const duration = durRaw + ? durUnit.startsWith('s') + ? Math.round(Number(durRaw) * 1000) + : Math.round(Number(durRaw)) + : undefined; + if (label) + return { kind: 'longPress', label, ...(duration != null ? { duration } : {}), verbatim }; + } + const clickMatch = t.match(/^(?:click|tap|select|choose|pick)(?:\s+on)?\s+(?:the\s+)?(.+)$/i); if (clickMatch) { const label = trimPunct(clickMatch[1].trim()); diff --git a/src/flow/parallel-runner.ts b/src/flow/parallel-runner.ts index 9a74a24..7ee5ca6 100644 --- a/src/flow/parallel-runner.ts +++ b/src/flow/parallel-runner.ts @@ -90,7 +90,7 @@ async function discoverDevices( const args: Record = { platform }; if (platform === 'ios' && deviceType) args.iosDeviceType = deviceType; - const result = await mcp.callTool('select_platform', args); + const result = await mcp.callTool('select_device', args); const text = extractText(result); const devices = parseDeviceList(text, platform); @@ -234,7 +234,10 @@ async function runWorkerJob( ); try { - await scopedMcp.callTool('delete_session', { sessionId: deviceResult.sessionId }); + await scopedMcp.callTool('appium_session_management', { + action: 'delete', + sessionId: deviceResult.sessionId, + }); } catch { /* ignore */ } @@ -257,7 +260,10 @@ async function runWorkerJob( 
const message = err instanceof Error ? err.message : String(err); console.error(`${label} ${chalk.red('error')} — ${job.flowFile}: ${message}`); try { - await scopedMcp.callTool('delete_session', { sessionId: deviceResult.sessionId }); + await scopedMcp.callTool('appium_session_management', { + action: 'delete', + sessionId: deviceResult.sessionId, + }); } catch { /* ignore */ } diff --git a/src/flow/run-yaml-flow.ts b/src/flow/run-yaml-flow.ts index c6cad61..9b93fac 100644 --- a/src/flow/run-yaml-flow.ts +++ b/src/flow/run-yaml-flow.ts @@ -9,7 +9,7 @@ */ import type { MCPClient } from '../mcp/types.js'; -import { getPageSource } from '../mcp/tools.js'; +import { getPageSource, findElementByVision } from '../mcp/tools.js'; import { activateAppWithFallback } from '../mcp/activate-app.js'; import { detectDeviceUdid, typeViaKeyboard, typeViaSetValue } from '../mcp/keyboard.js'; import { detectPlatform } from '../perception/screen.js'; @@ -136,6 +136,8 @@ function stepLabel(step: FlowStep): string { return `wait until "${step.text}" is visible (${step.timeoutSeconds}s timeout)`; case 'tap': return `tap "${step.label}"`; + case 'longPress': + return `long-press "${step.label}"${step.duration != null ? ` (${step.duration}ms)` : ''}`; case 'type': return `type "${step.text.length > 40 ? 
`${step.text.slice(0, 37)}…` : step.text}"`; case 'enter': @@ -198,12 +200,9 @@ async function pressEnterKey(mcp: MCPClient): Promise { /* try next strategy */ } - // Strategy 2: mobile: shell via appium_execute_script (Android) + // Strategy 2: appium_mobile_press_key ENTER fallback try { - await mcp.callTool('appium_execute_script', { - script: 'mobile: shell', - args: [{ command: 'input', args: ['keyevent', '66'] }], - }); + await mcp.callTool('appium_mobile_press_key', { key: 'ENTER' }); return { success: true, message: 'Pressed Enter' }; } catch { /* try next strategy */ @@ -250,7 +249,7 @@ async function tryTapByVision(mcp: MCPClient, label: string): Promise { + // Vision mode: locate via df-vision → coordinate-based long press + if (isVisionMode() || isVisionLocateEnabled()) { + try { + const visionUuid = await findElementByVision(mcp, label); + const coords = parseAIElementCoords(visionUuid); + if (coords) { + await mcp.callTool('appium_gesture', { + action: 'long_press', + x: coords.x, + y: coords.y, + duration, + }); + return { + success: true, + message: `Long-pressed "${label}" via vision at [${coords.x}, ${coords.y}] (${duration}ms)`, + }; + } + } catch { + // Fall through to DOM + } + } + + // DOM mode: find element UUID → long press by UUID + const pageSource = await getPageSource(mcp); + const platform = detectPlatform(pageSource); + const elements = + platform === 'android' ? 
parseAndroidPageSource(pageSource) : parseIOSPageSource(pageSource); + + const scored = elements + .map((el) => ({ el, s: scoreTapMatch(el, label) })) + .filter((x) => x.s >= 0) + .sort((a, b) => b.s - a.s); + + const pick = scored[0]?.el; + if (!pick) return { success: false, message: `No matching element for "${label}"` }; + + const uuid = await findByIdStrategies(mcp, pick.accessibilityId || pick.id, pick.text); + if (!uuid) return { success: false, message: `Found "${label}" but could not locate element` }; + + await mcp.callTool('appium_gesture', { action: 'long_press', elementUUID: uuid, duration }); + return { success: true, message: `Long-pressed "${label}" (${duration}ms)` }; +} + async function flowTypeText( mcp: MCPClient, text: string, @@ -347,7 +395,7 @@ async function flowTypeText( const coords = parseAIElementCoords(visionUuid); if (coords) await tapAtCoordinates(mcp, coords.x, coords.y); } else { - await mcp.callTool('appium_click', { elementUUID: visionUuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: visionUuid }); } } } @@ -393,7 +441,7 @@ async function flowTypeText( if (!uuid) { return { success: false, message: 'Could not resolve editable element' }; } - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); await mcp.callTool('appium_clear_element', { elementUUID: uuid }).catch(() => {}); const setResult = await mcp.callTool('appium_set_value', { ...(Config.CLOUD_PROVIDER ? 
{ w3cActions: true } : { elementUUID: uuid }), @@ -747,7 +795,7 @@ async function scrollUntilVisible( } for (let scroll = 0; scroll < maxScrolls; scroll++) { - await mcp.callTool('appium_scroll', { direction }); + await mcp.callTool('appium_gesture', { action: 'scroll', direction }); await sleep(800); if (await isVisible()) { @@ -928,6 +976,8 @@ export async function executeStep( return waitUntilCondition(mcp, step.condition, step.text, step.timeoutSeconds, tapPoll); case 'tap': return tapByLabel(mcp, step.label, tapPoll); + case 'longPress': + return longPressByLabel(mcp, step.label, step.duration); case 'type': return flowTypeText(mcp, step.text, step.target, deviceUdid); case 'enter': @@ -939,13 +989,15 @@ export async function executeStep( await mcp.callTool('appium_mobile_press_key', { key: 'HOME' }); return { success: true, message: 'Home' }; case 'swipe': { - // appium_scroll only supports up/down; use appium_swipe for left/right const dir = step.direction; const count = step.repeat ?? 1; - const toolName = dir === 'left' || dir === 'right' ? 'appium_swipe' : 'appium_scroll'; + const gestureAction = dir === 'left' || dir === 'right' ? 'swipe' : 'scroll'; let lastError = ''; for (let i = 0; i < count; i++) { - const result = await mcp.callTool(toolName, { direction: dir }); + const result = await mcp.callTool('appium_gesture', { + action: gestureAction, + direction: dir, + }); const text = result.content ?.map((c: { type: string; text?: string }) => (c.type === 'text' ? 
c.text : '')) diff --git a/src/flow/types.ts b/src/flow/types.ts index 48a2d9f..efeceae 100644 --- a/src/flow/types.ts +++ b/src/flow/types.ts @@ -65,6 +65,7 @@ export type FlowStep = timeoutSeconds: number; } & Verbatim) | ({ kind: 'tap'; label: string } & Verbatim) + | ({ kind: 'longPress'; label: string; duration?: number } & Verbatim) | ({ kind: 'type'; text: string; target?: string } & Verbatim) | ({ kind: 'enter' } & Verbatim) | ({ kind: 'back' } & Verbatim) diff --git a/src/flow/vision-execute.ts b/src/flow/vision-execute.ts index 5143221..3d75288 100644 --- a/src/flow/vision-execute.ts +++ b/src/flow/vision-execute.ts @@ -99,22 +99,51 @@ function parseJsonLenient(text: string): unknown { /* continue */ } + // Repair: LLMs sometimes omit the closing quote on a JSON key name before ':', + // emitting "key:[value] instead of "key":[value] + // The regex only fires at key positions (after '{' or ',') so it cannot corrupt + // string values that happen to contain a colon. + const repaired = cleaned.replace(/(?<=[{,]\s*)"([A-Za-z_][A-Za-z0-9_]*):/g, '"$1":'); + try { + return JSON.parse(repaired); + } catch { + /* continue */ + } + // Lenient path: extract the first balanced JSON object/array substring. + // Build `starts` with string-context awareness so that '[' or '{' characters + // inside string values are not mistaken for the start of a JSON structure. + // (Without this, a malformed key like "key:[926,357] would cause the inner '[' to + // be treated as a start, and [926,357] would be returned instead of the real object.) 
const starts: number[] = []; - for (let i = 0; i < cleaned.length; i++) { - const ch = cleaned[i]; - if (ch === '{' || ch === '[') starts.push(i); + { + let inStr = false; + let esc = false; + for (let i = 0; i < repaired.length; i++) { + const ch = repaired[i]; + if (inStr) { + if (esc) esc = false; + else if (ch === '\\') esc = true; + else if (ch === '"') inStr = false; + continue; + } + if (ch === '"') { + inStr = true; + continue; + } + if (ch === '{' || ch === '[') starts.push(i); + } } for (const start of starts) { - const open = cleaned[start]; + const open = repaired[start]; const close = open === '{' ? '}' : ']'; let depth = 0; let inString = false; let escaped = false; - for (let i = start; i < cleaned.length; i++) { - const ch = cleaned[i]; + for (let i = start; i < repaired.length; i++) { + const ch = repaired[i]; if (inString) { if (escaped) { @@ -136,7 +165,7 @@ function parseJsonLenient(text: string): unknown { if (ch === close) depth--; if (depth === 0) { - const candidate = cleaned.slice(start, i + 1); + const candidate = repaired.slice(start, i + 1); try { return JSON.parse(candidate); } catch { @@ -213,7 +242,10 @@ async function anchorVisibleInVision( } /** Actions from combinedInstructionPrompt that map to tap/click. */ -const TAP_ACTIONS = new Set(['click', 'tap', 'touch', 'select', 'long press', 'longpress']); +const TAP_ACTIONS = new Set(['click', 'tap', 'touch', 'select']); + +/** Actions that map to long press. */ +const LONG_PRESS_ACTIONS = new Set(['long press', 'longpress', 'long-press', 'press and hold']); /** Actions that map to type/enter text. */ const TYPE_ACTIONS = new Set(['enter', 'type', 'send', 'sendkeys', 'set', 'set value']); @@ -352,6 +384,25 @@ function preCheck(instruction: string): PreCheckResult | null { return { step: { kind: 'enter', verbatim: t } }; } + // 5b. 
long press (natural language — route to longPress step kind) + const longPressMatch = t.match( + /^(?:long[\s-]press|long[\s-]tap|press\s+and\s+hold)(?:\s+on)?\s+(?:the\s+)?(.+?)(?:\s+for\s+(\d+(?:\.\d+)?)\s*(ms|milliseconds?|s|seconds?))?$/i + ); + if (longPressMatch) { + const label = longPressMatch[1].replace(/[.!?]+$/g, '').trim(); + const durRaw = longPressMatch[2]; + const durUnit = longPressMatch[3] ?? 'ms'; + const duration = durRaw + ? durUnit.startsWith('s') + ? Math.round(Number(durRaw) * 1000) + : Math.round(Number(durRaw)) + : undefined; + if (label) + return { + step: { kind: 'longPress', label, ...(duration != null ? { duration } : {}), verbatim: t }, + }; + } + // 6. Visibility assert — any instruction starting with an assert/verify verb, // or "is X visible?" pattern. Pass the full instruction to the vision model // as-is — let the LLM interpret what to check instead of brittle regex parsing. @@ -616,10 +667,8 @@ export async function visionExecute( if (!hasElementCoords) { const step: FlowStep = { kind: 'swipe', direction, verbatim: instruction }; - // appium_scroll only supports up/down; use appium_swipe for left/right - const scrollTool = - direction === 'left' || direction === 'right' ? 'appium_swipe' : 'appium_scroll'; - await mcp.callTool(scrollTool, { direction }); + const gestureAction = direction === 'left' || direction === 'right' ? 
'swipe' : 'scroll'; + await mcp.callTool('appium_gesture', { action: gestureAction, direction }); return { step, result: { success: true, message: `Swiped ${direction}` } }; } // Has element coords — fall through to element-targeted swipe below @@ -862,6 +911,16 @@ export async function visionExecute( }; } + // Long press — LLM classified this as "long press" + if (LONG_PRESS_ACTIONS.has(actionName)) { + const label = locators[0]?.element || instruction; + const step: FlowStep = { kind: 'longPress', label, verbatim: instruction }; + return { + step, + result: { success: true, message: '__needs_executeStep__' }, + }; + } + // Tap/click (default for most actions) if (TAP_ACTIONS.has(actionName) || locators.length > 0) { const label = locators[0]?.element || instruction; diff --git a/src/index.ts b/src/index.ts index 9e34df4..dd62e17 100644 --- a/src/index.ts +++ b/src/index.ts @@ -43,6 +43,7 @@ import { runExplorer } from './explorer/index.js'; import type { ExplorerConfig } from './explorer/types.js'; import { runPlayground } from './playground/index.js'; import { setupDevice } from './device/index.js'; +import { loadAppGuide } from './appguides/index.js'; import * as ui from './ui/terminal.js'; import { silenceTerminalUI } from './ui/terminal.js'; import { enableJsonMode, isJsonMode, emitJson } from './json-emitter.js'; @@ -199,7 +200,14 @@ function printHelp(): void { } function parseArgs(): CLIArgs { - const args = process.argv.slice(2); + // Normalize --flag=value into ['--flag', 'value'] so both forms work + const args = process.argv.slice(2).flatMap((arg) => { + if (arg.startsWith('--') && arg.includes('=')) { + const eq = arg.indexOf('='); + return [arg.slice(0, eq), arg.slice(eq + 1)]; + } + return [arg]; + }); if (args.includes('--help') || args.includes('-h')) { printHelp(); @@ -745,7 +753,7 @@ async function main() { }, }); try { - await mcp.callTool('delete_session', {}); + await mcp.callTool('appium_session_management', { action: 'delete' }); } catch { 
/* ignore */ } @@ -759,7 +767,7 @@ async function main() { data: { success: false, stepsExecuted: 0, stepsTotal: 0, reason: msg }, }); try { - await mcp.callTool('delete_session', {}); + await mcp.callTool('appium_session_management', { action: 'delete' }); } catch { /* ignore */ } @@ -854,12 +862,32 @@ async function main() { const appResolver = new AppResolver(); await appResolver.initialize(agentScopedMcp, resolvedPlatform); + // ── Detect app ID early — needed for AppGuide in planner + orchestrator ── + let journeyAppId: string | undefined; + try { + const { extractAppIdFromText } = await import('./memory/fingerprint.js'); + journeyAppId = extractAppIdFromText(goal); + if (!journeyAppId) { + const appMatch = goal.match( + /(?:open|launch|start)\s+(?:the\s+)?(\w[\w\s]*?)(?:\s+app|\s+and\b)/i + ); + if (appMatch) { + journeyAppId = appResolver.resolve(appMatch[1].trim()) ?? undefined; + } + } + } catch { + // Non-critical + } + + // Load AppGuide for the target app (if known) — shared by planner, orchestrator, and agent + const journeyAppGuide = journeyAppId ? loadAppGuide(journeyAppId) : undefined; + // ─── Always decompose goals into sub-goals ───────── ui.printPlanStart(); const plannerModel = buildModel(config); const thinkingOptions = buildThinkingOptions(config); - const planResult = await decomposeGoal(goal, plannerModel, thinkingOptions); + const planResult = await decomposeGoal(goal, plannerModel, thinkingOptions, journeyAppGuide); const executor = createPlanExecutor(planResult.subGoals); ui.stopSpinner(); @@ -887,26 +915,6 @@ async function main() { let journeyCost = 0; const allHistory: any[] = []; - // ── Episodic memory: detect app ID for the journey ── - // Try to resolve the primary app from the goal so all sub-goals share it. 
- let journeyAppId: string | undefined; - try { - const { extractAppIdFromText } = await import('./memory/fingerprint.js'); - // First try the raw goal for package names - journeyAppId = extractAppIdFromText(goal); - // If not found, try resolving app names from the goal (e.g., "YouTube" → "com.google.android.youtube") - if (!journeyAppId) { - const appMatch = goal.match( - /(?:open|launch|start)\s+(?:the\s+)?(\w[\w\s]*?)(?:\s+app|\s+and\b)/i - ); - if (appMatch) { - journeyAppId = appResolver.resolve(appMatch[1].trim()) ?? undefined; - } - } - } catch { - // Non-critical - } - while (!executor.isDone()) { const subGoal = executor.current!; @@ -960,7 +968,8 @@ async function main() { subGoal.goal, orchestratorDom, thinkingOptions, - orchestratorScreenshot + orchestratorScreenshot, + journeyAppGuide ) : Promise.resolve({ ready: true, issues: [] as string[] } as { ready: boolean; @@ -974,7 +983,8 @@ async function main() { completedGoalsList, orchestratorDom, thinkingOptions, - orchestratorScreenshot + orchestratorScreenshot, + journeyAppGuide ), ]); @@ -1153,7 +1163,7 @@ async function main() { if (recorder) recorder.save(allDone); try { - await mcpClient.callTool('delete_session', {}); + await mcpClient.callTool('appium_session_management', { action: 'delete' }); } catch { /* ignore */ } @@ -1167,7 +1177,7 @@ async function main() { }); ui.printError('Fatal error', err?.message ?? String(err)); try { - await mcpClient.callTool('delete_session', {}); + await mcpClient.callTool('appium_session_management', { action: 'delete' }); } catch { /* ignore */ } diff --git a/src/llm/prompts.ts b/src/llm/prompts.ts index af08a1a..2afcd3c 100644 --- a/src/llm/prompts.ts +++ b/src/llm/prompts.ts @@ -25,6 +25,7 @@ HOW TO INTERACT (DOM MODE) **To tap an element:** Use find_and_click(strategy, selector) — finds and clicks in ONE step. **To type into a field:** Use find_and_type(strategy, selector, text) — finds, clicks, clears, and types in ONE step. 
+**To long-press an element (context menu, drag, swipe-to-delete):** Use find_and_long_press(strategy, selector) — finds and long-presses in ONE step. **Locator strategies:** @@ -62,6 +63,11 @@ HOW TO INTERACT (VISION MODE) Example: find_and_type(selector="text input field labeled 'To' at the top", text="user@example.com") Example: find_and_type(selector="large text area below the subject line", text="Hello") +**To long-press an element (context menu, drag, swipe-to-delete):** Use find_and_long_press and describe what you SEE. + Example: find_and_long_press(selector="Medium Daily Digest email row", tapX=500, tapY=270) + Example: find_and_long_press(selector="file icon labeled report.pdf", duration=1500) + Do NOT use appium_gesture or any raw Appium tool for long press — always use find_and_long_press. + **SPEED BOOST — provide tap coordinates:** If you can estimate WHERE the element is in the screenshot, include tapX and tapY. Use normalized 0-1000 scale: (0,0) is top-left, (1000,1000) is bottom-right. @@ -148,11 +154,13 @@ export function buildSystemPrompt( ? ` **Primary tools — use strategy="ai_instruction" with a visual description:** - find_and_click: Visually find + click in one step. -- find_and_type: Visually find + click + type text in one step.` +- find_and_type: Visually find + click + type text in one step. +- find_and_long_press: Visually find + long-press in one step (context menus, drag initiation).` : ` **Primary tools:** - find_and_click: Find element + click in one step (strategy + selector). -- find_and_type: Find element + click + type text in one step (strategy + selector + text).`; +- find_and_type: Find element + click + type text in one step (strategy + selector + text). 
+- find_and_long_press: Find element + long-press in one step (strategy + selector + optional duration).`; // Vision fallback section for DOM mode const visionFallback = @@ -232,6 +240,13 @@ export function buildUserMessage(context: AgentContext): string { parts.push(`\n${context.pastExperience}`); } + // ── AppGuide: per-app navigation knowledge ──────────── + // Injected when the foreground app is recognised — gives the agent + // app-specific navigation patterns so it doesn't have to rediscover them. + if (context.appGuide) { + parts.push(`\n${context.appGuide}`); + } + // ── Contextual hints ────────────────────────────────── // Targeted micro-reminders based on current state. Additive only — // these reinforce existing rules when they matter most. diff --git a/src/llm/provider.ts b/src/llm/provider.ts index 638cf53..e389842 100644 --- a/src/llm/provider.ts +++ b/src/llm/provider.ts @@ -53,6 +53,8 @@ export interface AgentContext { failedOnScreen?: string; /** Episodic memory: relevant past experience from previous successful runs */ pastExperience?: string; + /** AppGuide: per-app navigation knowledge injected when a known app is in the foreground */ + appGuide?: string; } /** Token usage for a single LLM call */ @@ -190,6 +192,58 @@ function buildMetaTools(agentMode: 'dom' | 'vision'): Record { }), }); + const findAndLongPressVision = tool({ + description: + 'Long-press something on screen using AI vision (press and hold to open context menus, trigger drag, etc.). ' + + 'Describe what you SEE in plain language — visible text, icon shape, color, position. ' + + 'Do NOT use xpath, resource IDs, or element UUIDs. ' + + 'If you can estimate the location, provide tapX and tapY (normalized 0-1000) to skip the vision-locate step.', + inputSchema: z.object({ + selector: z + .string() + .describe( + 'Plain-language target, e.g. 
Medium Daily Digest email row, red unread notification dot' + ), + tapY: z + .number() + .optional() + .describe('Estimated Y position in normalized 0-1000 scale (0=top, 1000=bottom)'), + tapX: z + .number() + .optional() + .describe('Estimated X position in normalized 0-1000 scale (0=left, 1000=right)'), + duration: z + .number() + .int() + .optional() + .describe('Hold duration in milliseconds (default 2000, range 500-10000)'), + bounds: z + .string() + .optional() + .describe('Optional [x1,y1][x2,y2] center fallback if vision fails'), + }), + }); + + const findAndLongPressDom = tool({ + description: + 'Find an element and long-press it (press and hold) in one step. ' + + 'Use EXACT locator values from the DOM. ALWAYS include bounds from the DOM as fallback. ' + + 'Use for context menus, drag initiation, or any press-and-hold interaction.', + inputSchema: z.object({ + strategy: z.enum(['accessibility id', 'id', 'xpath']).describe('Locator strategy'), + selector: z.string().describe('Locator value — MUST be the EXACT, FULL string from the DOM'), + duration: z + .number() + .int() + .optional() + .describe('Hold duration in milliseconds (default 2000, range 500-10000)'), + bounds: z + .string() + .optional() + .describe('Element bounds from DOM e.g. [x1,y1][x2,y2] — used as coordinate fallback'), + }), + }); + const findAndClickDom = tool({ description: 'Find an element and click it in one step. ' + @@ -250,6 +304,8 @@ function buildMetaTools(agentMode: 'dom' | 'vision'): Record { find_and_type: agentMode === 'vision' ? findAndTypeVision : findAndTypeDom, + find_and_long_press: agentMode === 'vision' ? findAndLongPressVision : findAndLongPressDom, + launch_app: tool({ description: 'Launch/activate an app by package name (Android) or bundle ID (iOS). ' + @@ -424,6 +480,13 @@ export function createLLMProvider(config: AppClawConfig, mcpTools: MCPToolInfo[] }, ]; + // Request timeout — abort if the LLM takes too long (hangs on preview models). 
+ const timeoutMs = config.LLM_REQUEST_TIMEOUT_MS; + const abortController = timeoutMs > 0 ? new AbortController() : undefined; + const abortTimer = abortController + ? setTimeout(() => abortController.abort(), timeoutMs) + : undefined; + // Use streaming when callbacks are provided for live reasoning display. // Single streamText call with tools — streams any reasoning text the model // emits before its tool call, then extracts the tool call from the final result. @@ -435,6 +498,7 @@ export function createLLMProvider(config: AppClawConfig, mcpTools: MCPToolInfo[] tools: allTools, toolChoice: 'required' as const, ...(thinkingOptions ? { providerOptions: thinkingOptions } : {}), + ...(abortController ? { abortSignal: abortController.signal } : {}), messages, }); @@ -490,6 +554,7 @@ export function createLLMProvider(config: AppClawConfig, mcpTools: MCPToolInfo[] const toolCall = toolCalls?.[0]; if (!toolCall) { + clearTimeout(abortTimer); return { toolName: 'done', args: { reason: text || reasoningText || 'No action decided' }, @@ -501,6 +566,7 @@ export function createLLMProvider(config: AppClawConfig, mcpTools: MCPToolInfo[] const toolArgs = 'args' in toolCall ? (toolCall as any).args : (toolCall as any).input; lastToolName = toolCall.toolName; + clearTimeout(abortTimer); return { toolName: toolCall.toolName, @@ -517,8 +583,10 @@ export function createLLMProvider(config: AppClawConfig, mcpTools: MCPToolInfo[] tools: allTools, toolChoice: 'required' as const, ...(thinkingOptions ? { providerOptions: thinkingOptions } : {}), + ...(abortController ? 
{ abortSignal: abortController.signal } : {}), messages, }); + clearTimeout(abortTimer); // Prefer totalUsage + raw Gemini usageMetadata — some models omit fields the SDK maps to 0 const extracted = extractUsageFromGenerateTextResult(result); diff --git a/src/mcp/activate-app.ts b/src/mcp/activate-app.ts index 49c29d6..c0df752 100644 --- a/src/mcp/activate-app.ts +++ b/src/mcp/activate-app.ts @@ -43,7 +43,7 @@ export async function activateAppWithFallback( mcp: MCPClient, packageId: string ): Promise<{ success: boolean; message: string }> { - const primary = await mcp.callTool('appium_activate_app', { id: packageId }); + const primary = await mcp.callTool('appium_app_lifecycle', { action: 'activate', id: packageId }); const t0 = extractText(primary); if (!responseLooksLikeFailure(t0)) { return { success: true, message: t0.slice(0, 240) || `Activated ${packageId}` }; @@ -51,34 +51,16 @@ export async function activateAppWithFallback( const url = DEEP_LINK_BY_PACKAGE[packageId]; if (url) { - const deepVariants: Record[] = [ - { url, appId: packageId }, - { url, package: packageId }, - ]; - for (const deepArgs of deepVariants) { - const t1 = await callToolQuiet(mcp, 'appium_deep_link', deepArgs); - if (t1 !== null && !responseLooksLikeFailure(t1)) { - return { - success: true, - message: `Deep link opened ${packageId}: ${t1.slice(0, 160)}`, - }; - } - } - - const t2 = await callToolQuiet(mcp, 'appium_execute_script', { - script: 'mobile: deepLink', - args: [{ url, package: packageId }], - }); - if (t2 !== null && !responseLooksLikeFailure(t2)) { - return { success: true, message: `mobile:deepLink: ${t2.slice(0, 200)}` }; - } - - const t3 = await callToolQuiet(mcp, 'appium_execute_script', { - script: 'mobile: deepLink', - args: [{ url, appPackage: packageId }], + const t1 = await callToolQuiet(mcp, 'appium_app_lifecycle', { + action: 'deep_link', + url, + id: packageId, }); - if (t3 !== null && !responseLooksLikeFailure(t3)) { - return { success: true, message: 
`mobile:deepLink: ${t3.slice(0, 200)}` }; + if (t1 !== null && !responseLooksLikeFailure(t1)) { + return { + success: true, + message: `Deep link opened ${packageId}: ${t1.slice(0, 160)}`, + }; } } diff --git a/src/mcp/client.ts b/src/mcp/client.ts index 1ccc5e5..ef0094e 100644 --- a/src/mcp/client.ts +++ b/src/mcp/client.ts @@ -1,3 +1,4 @@ +import { createRequire } from 'module'; import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js'; import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js'; @@ -5,6 +6,29 @@ import type { MCPClient, MCPConfig, MCPToolResult, MCPToolInfo } from './types.j import { theme } from '../ui/terminal.js'; import { VERSION } from '../version.js'; +/** + * Resolve the appium-mcp binary. + * + * Prefer the locally-installed package (bundled as a dependency) so the + * MCP server starts immediately — no npm download at connect time. + * The MCP SDK's initialize handshake has a hardcoded 60 s timeout that + * fires before npx can download a missing package in slow CI environments. + * + * Falls back to npx for backwards compatibility (e.g. very old global installs + * that pre-date appium-mcp being a listed dependency). 
+ */ +function resolveAppiumMcp(): { command: string; args: string[] } { + try { + const req = createRequire(import.meta.url); + const bin = req.resolve('appium-mcp'); + return { command: 'node', args: [bin] }; + } catch { + return { command: 'npx', args: ['--yes', 'appium-mcp@1.67.0'] }; + } +} + +const appiumMcp = resolveAppiumMcp(); + /** Tools that produce verbose output we don't want to log */ const QUIET_TOOLS = new Set(['appium_get_page_source', 'appium_screenshot', 'appium_list_apps']); @@ -49,9 +73,8 @@ async function connectClient(config: MCPConfig): Promise { `${process.env.HOME}/Library/Android/sdk`; const transport = new StdioClientTransport({ - command: 'npx', - // --yes: auto-confirm installation without prompting (avoids consuming MCP stdin as "y/n" answer) - args: ['--yes', 'appium-mcp@1.49.1'], + command: appiumMcp.command, + args: appiumMcp.args, env: { ...process.env, ANDROID_HOME: androidHome, diff --git a/src/mcp/session-client.ts b/src/mcp/session-client.ts index c7cccef..73fc8cc 100644 --- a/src/mcp/session-client.ts +++ b/src/mcp/session-client.ts @@ -17,11 +17,9 @@ import type { MCPClient, MCPToolResult, MCPToolInfo } from './types.js'; * These must NOT receive a sessionId injection. 
*/ const PRE_SESSION_TOOLS = new Set([ - 'create_session', - 'select_platform', + 'appium_session_management', 'select_device', 'delete_all_sessions', - 'list_sessions', ]); export class SessionScopedMCPClient implements MCPClient { diff --git a/src/mcp/tool-converter.ts b/src/mcp/tool-converter.ts index 8672966..c6ea8fa 100644 --- a/src/mcp/tool-converter.ts +++ b/src/mcp/tool-converter.ts @@ -39,11 +39,7 @@ export function convertMCPToolsToAITools( /** MCP tools the agent should never call directly */ export const EXCLUDED_MCP_TOOLS = new Set([ - 'create_session', - 'delete_session', - 'list_sessions', - 'selectSession', - 'select_platform', + 'appium_session_management', 'select_device', 'prepare_ios_simulator', // AI code-gen tools — not relevant to device control @@ -51,6 +47,9 @@ export const EXCLUDED_MCP_TOOLS = new Set([ 'appium_generate_locators', 'generate_tests', 'generate_locators', + // Documentation/skills tools — not relevant to device control + 'appium_documentation_query', + 'appium_skills', ]); /** Additional tools to exclude in vision mode — DOM-based tools that distract the agent */ diff --git a/src/memory/fingerprint.ts b/src/memory/fingerprint.ts index 639e196..418d80c 100644 --- a/src/memory/fingerprint.ts +++ b/src/memory/fingerprint.ts @@ -133,10 +133,34 @@ export function extractGoalKeywords(goal: string): string[] { * Android DOM elements have `rid="com.foo.bar:id/xyz"` — extract the package prefix. * Returns undefined if no package can be detected. 
*/ +/** iOS XCUITest app name → bundle ID for known apps */ +const IOS_APP_NAME_TO_BUNDLE_ID: Record = { + Gmail: 'com.google.gmail', + YouTube: 'com.google.ios.youtube', + WhatsApp: 'net.whatsapp.WhatsApp', + Chrome: 'com.google.chrome', + Settings: 'com.apple.Preferences', + Safari: 'com.apple.mobilesafari', + Messages: 'com.apple.MobileSMS', + Maps: 'com.apple.Maps', + Instagram: 'com.burbn.instagram', + Spotify: 'com.spotify.client', + Twitter: 'com.atebits.Tweetie2', + X: 'com.atebits.Tweetie2', +}; + export function extractAppIdFromDom(dom: string): string | undefined { if (!dom) return undefined; - const match = dom.match(/rid="([a-z][a-z0-9_.]*):id\//); - return match?.[1]; + + // Android: resource ID prefix e.g. rid="com.google.android.gm:id/..." + const androidMatch = dom.match(/rid="([a-z][a-z0-9_.]*):id\//); + if (androidMatch) return androidMatch[1]; + + // iOS: XCUIElementTypeApplication name attribute e.g. name="Gmail" + const iosMatch = dom.match(/XCUIElementTypeApplication[^>]*\sname="([^"]+)"/); + if (iosMatch) return IOS_APP_NAME_TO_BUNDLE_ID[iosMatch[1]]; + + return undefined; } /** diff --git a/src/playground/index.ts b/src/playground/index.ts index 42d424b..8721496 100644 --- a/src/playground/index.ts +++ b/src/playground/index.ts @@ -84,6 +84,8 @@ function stepAction(step: FlowStep): string { return 'open'; case 'tap': return 'tap'; + case 'longPress': + return 'longpress'; case 'type': return 'type'; case 'swipe': @@ -120,6 +122,8 @@ function stepTarget(step: FlowStep): string { return step.query; case 'tap': return `"${step.label}"`; + case 'longPress': + return `"${step.label}"${step.duration != null ? ` (${step.duration}ms)` : ''}`; case 'type': return `"${step.text}"${step.target ? 
` → ${step.target}` : ''}`; case 'swipe': @@ -160,6 +164,8 @@ function spinnerDetail(step: FlowStep): string { switch (step.kind) { case 'tap': return 'tapping the screen…'; + case 'longPress': + return 'long-pressing the screen…'; case 'type': return 'typing into the field…'; case 'swipe': @@ -246,6 +252,10 @@ function stepToYaml(step: FlowStep): unknown { return `open ${step.query} app`; case 'tap': return `tap ${step.label}`; + case 'longPress': + return step.duration != null + ? `long press ${step.label} for ${step.duration}ms` + : `long press ${step.label}`; case 'type': return `type "${step.text}"`; case 'swipe': @@ -587,6 +597,15 @@ function printHelp(): void { 'navigate to Settings screen', ], }, + { + category: 'Long Press', + lines: [ + 'long press on first email', + 'long-press the image', + 'press and hold Delete button', + 'long press on file for 1500ms', + ], + }, { category: 'Type & Search', lines: [ @@ -706,6 +725,7 @@ async function connectToDevice(): Promise { cliUdid: _deviceArgs.udid ?? null, cliDeviceName: _deviceArgs.deviceName ?? 
null, config, + alwaysPickDevice: true, }); _resolvedPlatform = deviceResult.platform; @@ -1199,7 +1219,7 @@ export async function runPlayground(deviceArgs?: PlaygroundDeviceArgs): Promise< async function cleanup(): Promise { if (state.mcp) { try { - await state.mcp.callTool('delete_session', {}); + await state.mcp.callTool('appium_session_management', { action: 'delete' }); } catch { /* ignore — session may already be gone */ } @@ -1389,6 +1409,39 @@ async function processLine(line: string): Promise { } } + // ── Regex fast path: try parsing without LLM first ── + const regexParsed = tryParseNaturalFlowLine(line); + if (regexParsed) { + const stepNum = state.steps.length + 1; + if (regexParsed.kind === 'done') { + state.steps.push(regexParsed); + printStepSuccess(stepNum, regexParsed, 'recorded'); + return; + } + if (regexParsed.kind === 'getInfo') { + await handleGetInfo(regexParsed.query); + return; + } + ui.startSpinner(`[${stepNum}] ${regexParsed.kind}`, spinnerDetail(regexParsed)); + resetVisionTokens(); + try { + const result = await runStepOnDevice(regexParsed); + ui.stopSpinner(); + if (result.success) { + state.steps.push(regexParsed); + printStepSuccess(stepNum, regexParsed, result.message); + } else { + printStepFail(stepNum, regexParsed, result.message); + console.log(` ${theme.dim('Step not recorded. Fix and try again.')}`); + } + } catch (err: any) { + ui.stopSpinner(); + printStepFail(stepNum, regexParsed, err?.message ?? String(err)); + console.log(` ${theme.dim('Step not recorded. 
Fix and try again.')}`); + } + return; + } + // ── Two-call fallback: classify via LLM → execute via step runner ── let parsed: FlowStep; let classifyUsage: { inputTokens: number; outputTokens: number; totalTokens: number } | undefined; diff --git a/src/recording/replayer.ts b/src/recording/replayer.ts index 3d55bde..b51559b 100644 --- a/src/recording/replayer.ts +++ b/src/recording/replayer.ts @@ -219,7 +219,7 @@ async function executeReplayAction( const uuid = await findElementWithFallback(mcp, screenElements, elementId, coords); if (uuid) { - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); return { success: true, message: `Tapped ${elementId || `[${coordX}, ${coordY}]`}` }; } @@ -239,7 +239,7 @@ async function executeReplayAction( } } } else { - await mcp.callTool('appium_click', { elementUUID: visionUuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: visionUuid }); return { success: true, message: `Tapped "${visionDescription}" via AI vision` }; } } @@ -272,7 +272,7 @@ async function executeReplayAction( // Target is directly editable — use it const uuid = await findElementWithFallback(mcp, screenElements, elementId, coords); if (uuid) { - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); await mcp.callTool('appium_clear_element', { elementUUID: uuid }).catch(() => {}); await mcp.callTool('appium_set_value', { elementUUID: uuid, text }); return { success: true, message: `Typed "${text}"` }; @@ -283,7 +283,7 @@ async function executeReplayAction( // page source to find the actual editable element const clickUuid = await findElementWithFallback(mcp, screenElements, elementId, coords); if (clickUuid) { - await mcp.callTool('appium_click', { elementUUID: clickUuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: clickUuid }); } // Re-read page 
source to discover the real editable element @@ -309,7 +309,7 @@ async function executeReplayAction( return { success: false, message: `Could not find an editable input near ${target}` }; } - await mcp.callTool('appium_click', { elementUUID: typeUuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: typeUuid }); await mcp.callTool('appium_clear_element', { elementUUID: typeUuid }).catch(() => {}); await mcp.callTool('appium_set_value', { elementUUID: typeUuid, text }); return { success: true, message: `Typed "${text}"` }; diff --git a/src/skills/find-and-tap.ts b/src/skills/find-and-tap.ts index 99a393f..615e553 100644 --- a/src/skills/find-and-tap.ts +++ b/src/skills/find-and-tap.ts @@ -50,14 +50,15 @@ export async function findAndTap( try { // First try: use appium-mcp's scroll_to_element with accessibility id try { - await mcp.callTool('appium_scroll_to_element', { + await mcp.callTool('appium_gesture', { + action: 'scroll_to_element', strategy: 'accessibility id', selector: query, direction, maxScrolls, }); const uuid = await findElement(mcp, 'accessibility id', query); - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); return { success: true, message: `Found and tapped "${query}" via accessibility ID` }; } catch { // Fall through @@ -65,14 +66,15 @@ export async function findAndTap( // Second try: resource id strategy try { - await mcp.callTool('appium_scroll_to_element', { + await mcp.callTool('appium_gesture', { + action: 'scroll_to_element', strategy: 'id', selector: query, direction, maxScrolls, }); const uuid = await findElement(mcp, 'id', query); - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); return { success: true, message: `Found and tapped "${query}" via resource ID` }; } catch { // Fall through @@ -93,7 +95,7 @@ export async function findAndTap( if 
(match.accessibilityId) { try { const uuid = await findElement(mcp, 'accessibility id', match.accessibilityId); - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); return { success: true, message: `Found and tapped "${query}" (accessibility id: ${match.accessibilityId})`, @@ -104,7 +106,7 @@ export async function findAndTap( try { const uuid = await findElement(mcp, 'id', match.accessibilityId); - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); return { success: true, message: `Found and tapped "${query}" (resource id: ${match.accessibilityId})`, @@ -122,7 +124,7 @@ export async function findAndTap( // Not found — scroll and try again if (i < maxScrolls) { - await mcp.callTool('appium_scroll', { direction }); + await mcp.callTool('appium_gesture', { action: 'scroll', direction }); await sleep(500); } } @@ -132,7 +134,7 @@ export async function findAndTap( try { const visionUuid = await findElementByVision(mcp, query); // Pass UUID directly to appium_click — it handles ai-element: UUIDs natively - await mcp.callTool('appium_click', { elementUUID: visionUuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: visionUuid }); const coords = parseAIElementCoords(visionUuid); const coordInfo = coords ? 
` at [${coords.x},${coords.y}]` : ''; return { success: true, message: `Found and tapped "${query}" via AI vision${coordInfo}` }; diff --git a/src/skills/read-screen.ts b/src/skills/read-screen.ts index 641ea0f..9528f1f 100644 --- a/src/skills/read-screen.ts +++ b/src/skills/read-screen.ts @@ -49,7 +49,7 @@ export async function readScreen(mcp: MCPClient, maxScrolls: number = 5): Promis // Scroll down for more content (skip on last iteration) if (i < maxScrolls) { - await mcp.callTool('appium_scroll', { direction: 'down' }); + await mcp.callTool('appium_gesture', { action: 'scroll', direction: 'down' }); scrollCount++; await sleep(500); } diff --git a/src/skills/submit-message.ts b/src/skills/submit-message.ts index 151e42b..947e445 100644 --- a/src/skills/submit-message.ts +++ b/src/skills/submit-message.ts @@ -112,7 +112,7 @@ export async function submitMessage(mcp: MCPClient): Promise { if (sendButton.accessibilityId) { try { const uuid = await findElement(mcp, 'accessibility id', sendButton.accessibilityId); - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); await sleep(1000); return { success: true, message: `Tapped Send button (${sendButton.accessibilityId})` }; } catch { @@ -123,7 +123,7 @@ export async function submitMessage(mcp: MCPClient): Promise { if (sendButton.id) { try { const uuid = await findElement(mcp, 'id', sendButton.id); - await mcp.callTool('appium_click', { elementUUID: uuid }); + await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); await sleep(1000); return { success: true, message: `Tapped Send button (${sendButton.id})` }; } catch { @@ -135,7 +135,7 @@ export async function submitMessage(mcp: MCPClient): Promise { try { const xpathQuery = `//*[@text='${sendButton.text}' or @content-desc='${sendButton.text}']`; const uuid = await findElement(mcp, 'xpath', xpathQuery); - await mcp.callTool('appium_click', { elementUUID: uuid }); + 
await mcp.callTool('appium_gesture', { action: 'tap', elementUUID: uuid }); await sleep(1000); return { success: true, message: `Tapped Send button via xpath` }; } catch { diff --git a/src/ui/terminal.ts b/src/ui/terminal.ts index c15cff8..316864e 100644 --- a/src/ui/terminal.ts +++ b/src/ui/terminal.ts @@ -254,6 +254,71 @@ let spinnerFrame = 0; let spinnerLineActive = false; let spinnerPrimary = ''; let spinnerDetail: string | undefined; +let wordRotateTimer: ReturnType | null = null; + +/** Fun verbs shown while the agent is thinking — rotated randomly. */ +const THINKING_VERBS = [ + 'Brewing…', + 'Cascading…', + 'Channeling…', + 'Choreographing…', + 'Churning…', + 'Coalescing…', + 'Cogitating…', + 'Composing…', + 'Computing…', + 'Concocting…', + 'Considering…', + 'Contemplating…', + 'Cooking…', + 'Crafting…', + 'Crunching…', + 'Crystallizing…', + 'Cultivating…', + 'Deciphering…', + 'Deliberating…', + 'Determining…', + 'Elucidating…', + 'Envisioning…', + 'Fermenting…', + 'Forging…', + 'Forming…', + 'Generating…', + 'Germinating…', + 'Harmonizing…', + 'Hatching…', + 'Ideating…', + 'Imagining…', + 'Incubating…', + 'Inferring…', + 'Manifesting…', + 'Marinating…', + 'Mulling…', + 'Musing…', + 'Noodling…', + 'Orchestrating…', + 'Percolating…', + 'Pondering…', + 'Processing…', + 'Ruminating…', + 'Simmering…', + 'Sketching…', + 'Spinning…', + 'Sprouting…', + 'Synthesizing…', + 'Tinkering…', + 'Unravelling…', + 'Vibing…', + 'Whirring…', + 'Whisking…', + 'Working…', + 'Wrangling…', + 'Zesting…', +]; + +function pickThinkingVerb(): string { + return THINKING_VERBS[Math.floor(Math.random() * THINKING_VERBS.length)]; +} function paintSpinnerLine(frame: number, overwrite: boolean): void { const sym = theme.brand(SPINNER.frames[frame % SPINNER.frames.length]); @@ -267,12 +332,17 @@ function paintSpinnerLine(frame: number, overwrite: boolean): void { process.stdout.write(line); } -export function formatAgentThinkingDetail(modelName: string): string { +export function 
formatAgentThinkingDetail( + modelName: string, + step?: number, + maxSteps?: number +): string { const mode = Config.AGENT_MODE === 'vision' ? 'vision' : 'dom'; const think = Config.LLM_THINKING === 'on' ? 'thinking on' : 'thinking off'; const m = modelName.trim() || 'model'; const short = m.length > 40 ? `${m.slice(0, 37)}…` : m; - return `${mode} · ${think} · ${short}`; + const stepStr = step != null && maxSteps != null ? `${step}/${maxSteps} · ` : ''; + return `${stepStr}${mode} · ${think} · ${short}`; } export function printAgentBullet(message: string): void { @@ -286,11 +356,11 @@ export function updateSpinner(message?: string, detail?: string): void { paintSpinnerLine(spinnerFrame, true); } -export function startSpinner(message: string, detail?: string): void { +export function startSpinner(message: string, detail?: string, rotateWords = false): void { stopSpinner(); spinnerFrame = 0; spinnerLineActive = true; - spinnerPrimary = message; + spinnerPrimary = rotateWords ? pickThinkingVerb() : message; spinnerDetail = detail; process.stdout.write('\x1B[?25l'); paintSpinnerLine(spinnerFrame, false); @@ -298,9 +368,19 @@ export function startSpinner(message: string, detail?: string): void { spinnerFrame = (spinnerFrame + 1) % SPINNER.frames.length; paintSpinnerLine(spinnerFrame, true); }, SPINNER.interval); + if (rotateWords) { + wordRotateTimer = setInterval(() => { + spinnerPrimary = pickThinkingVerb(); + paintSpinnerLine(spinnerFrame, true); + }, 2500); + } } export function stopSpinner(finalMessage?: string): void { + if (wordRotateTimer) { + clearInterval(wordRotateTimer); + wordRotateTimer = null; + } if (spinnerTimer) { clearInterval(spinnerTimer); spinnerTimer = null; @@ -493,7 +573,7 @@ export function printGoalStart(goal: string, maxSteps: number): void { const content = [ ...wrapped.map((l) => chalk.bold(l)), '', - `${theme.dim(`max ${maxSteps} steps`)} ${progressBar(0, maxSteps, 15)}`, + theme.dim(`max ${maxSteps} steps`), ].join('\n'); console.log(); 
diff --git a/src/vision/window-size.ts b/src/vision/window-size.ts index 5588375..c5b3963 100644 --- a/src/vision/window-size.ts +++ b/src/vision/window-size.ts @@ -117,9 +117,9 @@ export async function getScreenSizeForStark( } } - // 2. appium_mobile_get_device_info — Android: realDisplaySize in physical pixels + // 2. appium_mobile_device_info — Android: realDisplaySize in physical pixels try { - const result = await mcp.callTool('appium_mobile_get_device_info', {}); + const result = await mcp.callTool('appium_mobile_device_info', {}); const text = mcpResultText(result); const sizeMatch = text.match(/realDisplaySize['":\s]+(\d+)x(\d+)/i); if (sizeMatch) {