From ca9d0b1320a119cabdc35c05053af69f8b905952 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 31 Mar 2026 11:48:56 -0700 Subject: [PATCH 01/11] feat!: migrate CLI to scrapegraph-js v2 API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align the CLI with ScrapeGraphAI/scrapegraph-js#11 (v2 SDK migration): - Rename smart-scraper → extract, search-scraper → search - Remove commands dropped from the API: agentic-scraper, generate-schema, sitemap, validate - Add client factory (src/lib/client.ts) using the new scrapegraphai({ apiKey }) pattern - Update scrape command with --format flag (markdown, html, screenshot, branding) - Update crawl to use crawl.start/status polling lifecycle - Update history to use v2 service names and parameters - All commands now use try/catch (v2 throws on error) and self-timed elapsed BREAKING CHANGE: CLI commands have been renamed and removed to match the v2 API surface. Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 281 ++++++++++++-------------------- bun.lock | 4 +- package.json | 4 +- src/cli.ts | 10 +- src/commands/agentic-scraper.ts | 51 ------ src/commands/crawl.ts | 78 +++++---- src/commands/credits.ts | 19 ++- src/commands/extract.ts | 57 +++++++ src/commands/generate-schema.ts | 37 ----- src/commands/history.ts | 144 ++++++++-------- src/commands/markdownify.ts | 31 ++-- src/commands/scrape.ts | 41 +++-- src/commands/search-scraper.ts | 52 ------ src/commands/search.ts | 51 ++++++ src/commands/sitemap.ts | 31 ---- src/commands/smart-scraper.ts | 57 ------- src/commands/validate.ts | 25 --- src/lib/client.ts | 18 ++ src/utils/banner.ts | 4 +- 19 files changed, 407 insertions(+), 588 deletions(-) delete mode 100644 src/commands/agentic-scraper.ts create mode 100644 src/commands/extract.ts delete mode 100644 src/commands/generate-schema.ts delete mode 100644 src/commands/search-scraper.ts create mode 100644 src/commands/search.ts delete mode 100644 
src/commands/sitemap.ts delete mode 100644 src/commands/smart-scraper.ts delete mode 100644 src/commands/validate.ts create mode 100644 src/lib/client.ts diff --git a/README.md b/README.md index 5871dea..05b9eb0 100644 --- a/README.md +++ b/README.md @@ -5,40 +5,27 @@ Made with love by the [ScrapeGraphAI team](https://scrapegraphai.com) 💜 ![Demo Video](/assets/demo.gif) -Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, and crawling. +Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, and crawling. Uses the **v2 API**. ## Project Structure ``` just-scrape/ -├── docs/ # API response docs per endpoint -│ ├── smartscraper.md -│ ├── searchscraper.md -│ ├── markdownify.md -│ ├── crawl.md -│ ├── scrape.md -│ ├── agenticscraper.md -│ ├── generate-schema.md -│ ├── sitemap.md -│ └── credits.md ├── src/ │ ├── cli.ts # Entry point, citty main command + subcommands │ ├── lib/ +│ │ ├── client.ts # ScrapeGraphAI v2 client factory │ │ ├── env.ts # Env config (API key, JUST_SCRAPE_* → SGAI_* bridge) │ │ ├── folders.ts # API key resolution + interactive prompt │ │ └── log.ts # Logger factory + syntax-highlighted JSON output │ ├── commands/ -│ │ ├── smart-scraper.ts -│ │ ├── search-scraper.ts +│ │ ├── extract.ts +│ │ ├── search.ts +│ │ ├── scrape.ts │ │ ├── markdownify.ts │ │ ├── crawl.ts -│ │ ├── sitemap.ts -│ │ ├── scrape.ts -│ │ ├── agentic-scraper.ts -│ │ ├── generate-schema.ts │ │ ├── history.ts -│ │ ├── credits.ts -│ │ └── validate.ts +│ │ └── credits.ts │ └── utils/ │ └── banner.ts # ASCII banner + version from package.json ├── dist/ # Build 
output (git-ignored) @@ -90,264 +77,190 @@ Four ways to provide it (checked in order): | Variable | Description | Default | |---|---|---| | `SGAI_API_KEY` | ScrapeGraph API key | — | -| `JUST_SCRAPE_API_URL` | Override API base URL | `https://api.scrapegraphai.com/v1` | -| `JUST_SCRAPE_TIMEOUT_S` | Request/polling timeout in seconds | `120` | -| `JUST_SCRAPE_DEBUG` | Set to `1` to enable debug logging to stderr | `0` | +| `SGAI_API_URL` | Override API base URL | `https://api.scrapegraphai.com` | +| `SGAI_TIMEOUT_S` | Request timeout in seconds | `30` | + +Legacy variables (`JUST_SCRAPE_API_URL`, `JUST_SCRAPE_TIMEOUT_S`, `JUST_SCRAPE_DEBUG`) are still bridged. ## JSON Mode (`--json`) All commands support `--json` for machine-readable output. When set, banner, spinners, and interactive prompts are suppressed — only minified JSON on stdout (saves tokens when piped to AI agents). ```bash -just-scrape credits --json | jq '.remaining_credits' -just-scrape smart-scraper https://example.com -p "Extract data" --json > result.json -just-scrape history smartscraper --json | jq '.requests[].status' +just-scrape credits --json | jq '.remainingCredits' +just-scrape extract https://example.com -p "Extract data" --json > result.json +just-scrape history extract --json | jq '.[].status' ``` --- -## Smart Scraper +## Extract -Extract structured data from any URL using AI. [docs](https://docs.scrapegraphai.com/services/smartscraper) +Extract structured data from any URL using AI (replaces `smart-scraper`). 
[docs](https://docs.scrapegraphai.com/api-reference/extract) ### Usage ```bash -just-scrape smart-scraper -p # Extract data with AI -just-scrape smart-scraper -p --schema # Enforce output schema -just-scrape smart-scraper -p --scrolls # Infinite scroll (0-100) -just-scrape smart-scraper -p --pages # Multi-page (1-100) -just-scrape smart-scraper -p --stealth # Anti-bot bypass (+4 credits) -just-scrape smart-scraper -p --cookies --headers -just-scrape smart-scraper -p --plain-text # Plain text instead of JSON +just-scrape extract -p # Extract data with AI +just-scrape extract -p --schema # Enforce output schema +just-scrape extract -p --scrolls # Infinite scroll (0-100) +just-scrape extract -p --stealth # Anti-bot bypass (+4 credits) +just-scrape extract -p --cookies --headers +just-scrape extract -p --country # Geo-targeting ``` ### Examples ```bash # Extract product listings from an e-commerce page -just-scrape smart-scraper https://store.example.com/shoes -p "Extract all product names, prices, and ratings" +just-scrape extract https://store.example.com/shoes -p "Extract all product names, prices, and ratings" # Extract with a strict schema, scrolling to load more content -just-scrape smart-scraper https://news.example.com -p "Get all article headlines and dates" \ +just-scrape extract https://news.example.com -p "Get all article headlines and dates" \ --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \ --scrolls 5 # Scrape a JS-heavy SPA behind anti-bot protection -just-scrape smart-scraper https://app.example.com/dashboard -p "Extract user stats" \ +just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \ --stealth ``` -## Search Scraper +## Search -Search the web and extract structured data from results. 
[docs](https://docs.scrapegraphai.com/services/searchscraper) +Search the web and extract structured data from results (replaces `search-scraper`). [docs](https://docs.scrapegraphai.com/api-reference/search) ### Usage ```bash -just-scrape search-scraper # AI-powered web search -just-scrape search-scraper --num-results # Sources to scrape (3-20, default 3) -just-scrape search-scraper --no-extraction # Markdown only (2 credits vs 10) -just-scrape search-scraper --schema # Enforce output schema -just-scrape search-scraper --stealth --headers +just-scrape search # AI-powered web search +just-scrape search --num-results # Sources to scrape (1-20, default 3) +just-scrape search -p # Extraction prompt for results +just-scrape search --schema # Enforce output schema +just-scrape search --headers ``` ### Examples ```bash # Research a topic across multiple sources -just-scrape search-scraper "What are the best Python web frameworks in 2025?" --num-results 10 - -# Get raw markdown from search results (cheaper) -just-scrape search-scraper "React vs Vue comparison" --no-extraction --num-results 5 +just-scrape search "What are the best Python web frameworks in 2025?" --num-results 10 # Structured output with schema -just-scrape search-scraper "Top 5 cloud providers pricing" \ +just-scrape search "Top 5 cloud providers pricing" \ --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}' ``` -## Markdownify - -Convert any webpage to clean markdown. 
[docs](https://docs.scrapegraphai.com/services/markdownify) - -### Usage - -```bash -just-scrape markdownify # Convert to markdown -just-scrape markdownify --stealth # Anti-bot bypass (+4 credits) -just-scrape markdownify --headers # Custom headers -``` - -### Examples - -```bash -# Convert a blog post to markdown -just-scrape markdownify https://blog.example.com/my-article - -# Convert a JS-rendered page behind Cloudflare -just-scrape markdownify https://protected.example.com --stealth - -# Pipe markdown to a file -just-scrape markdownify https://docs.example.com/api --json | jq -r '.result' > api-docs.md -``` - -## Crawl +## Scrape -Crawl multiple pages and extract data from each. [docs](https://docs.scrapegraphai.com/services/smartcrawler) +Scrape content from a URL in various formats: markdown (default), html, screenshot, or branding. [docs](https://docs.scrapegraphai.com/api-reference/scrape) ### Usage ```bash -just-scrape crawl -p # Crawl + extract -just-scrape crawl -p --max-pages # Max pages (default 10) -just-scrape crawl -p --depth # Crawl depth (default 1) -just-scrape crawl --no-extraction --max-pages # Markdown only (2 credits/page) -just-scrape crawl -p --schema # Enforce output schema -just-scrape crawl -p --rules # Crawl rules (include_paths, same_domain) -just-scrape crawl -p --no-sitemap # Skip sitemap discovery -just-scrape crawl -p --stealth # Anti-bot bypass +just-scrape scrape # Markdown (default) +just-scrape scrape -f html # Raw HTML +just-scrape scrape -f screenshot # Screenshot +just-scrape scrape -f branding # Extract branding info +just-scrape scrape --stealth # Anti-bot bypass (+4 credits) +just-scrape scrape --country # Geo-targeting ``` ### Examples ```bash -# Crawl a docs site and extract all code examples -just-scrape crawl https://docs.example.com -p "Extract all code snippets with their language" \ - --max-pages 20 --depth 3 - -# Crawl only blog pages, skip everything else -just-scrape crawl https://example.com -p "Extract article 
titles and summaries" \ - --rules '{"include_paths":["/blog/*"],"same_domain":true}' --max-pages 50 - -# Get raw markdown from all pages (no AI extraction, cheaper) -just-scrape crawl https://example.com --no-extraction --max-pages 10 -``` - -## Sitemap - -Get all URLs from a website's sitemap. [docs](https://docs.scrapegraphai.com/services/sitemap) - -### Usage - -```bash -just-scrape sitemap -``` +# Get markdown of a page +just-scrape scrape https://example.com -### Examples +# Get raw HTML +just-scrape scrape https://example.com -f html -```bash -# List all pages on a site -just-scrape sitemap https://example.com +# Scrape with anti-bot bypass and geo-targeting +just-scrape scrape https://store.example.com --stealth --country DE -# Pipe URLs to another tool -just-scrape sitemap https://example.com --json | jq -r '.urls[]' +# Extract branding info (logos, colors, fonts) +just-scrape scrape https://example.com -f branding ``` -## Scrape +## Markdownify -Get raw HTML content from a URL. [docs](https://docs.scrapegraphai.com/services/scrape) +Convert any webpage to clean markdown (convenience wrapper for `scrape --format markdown`). 
[docs](https://docs.scrapegraphai.com/api-reference/scrape) ### Usage ```bash -just-scrape scrape # Raw HTML -just-scrape scrape --stealth # Anti-bot bypass (+4 credits) -just-scrape scrape --branding # Extract branding (+2 credits) -just-scrape scrape --country-code # Geo-targeting +just-scrape markdownify # Convert to markdown +just-scrape markdownify --stealth # Anti-bot bypass (+4 credits) +just-scrape markdownify --headers # Custom headers ``` ### Examples ```bash -# Get raw HTML of a page -just-scrape scrape https://example.com +# Convert a blog post to markdown +just-scrape markdownify https://blog.example.com/my-article -# Scrape a geo-restricted page with anti-bot bypass -just-scrape scrape https://store.example.com --stealth --country-code DE +# Convert a JS-rendered page behind Cloudflare +just-scrape markdownify https://protected.example.com --stealth -# Extract branding info (logos, colors, fonts) -just-scrape scrape https://example.com --branding +# Pipe markdown to a file +just-scrape markdownify https://docs.example.com/api --json | jq -r '.markdown' > api-docs.md ``` -## Agentic Scraper +## Crawl -Browser automation with AI — login, click, navigate, fill forms. [docs](https://docs.scrapegraphai.com/services/agenticscraper) +Crawl multiple pages. The CLI starts the crawl and polls until completion. 
[docs](https://docs.scrapegraphai.com/api-reference/crawl) ### Usage ```bash -just-scrape agentic-scraper -s # Run browser steps -just-scrape agentic-scraper -s --ai-extraction -p -just-scrape agentic-scraper -s --schema -just-scrape agentic-scraper -s --use-session # Persist browser session +just-scrape crawl # Crawl with defaults +just-scrape crawl --max-pages # Max pages (default 50) +just-scrape crawl --max-depth # Crawl depth (default 2) +just-scrape crawl --max-links-per-page # Links per page (default 10) +just-scrape crawl --allow-external # Allow external domains +just-scrape crawl --stealth # Anti-bot bypass ``` ### Examples ```bash -# Log in and extract dashboard data -just-scrape agentic-scraper https://app.example.com/login \ - -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \ - --ai-extraction -p "Extract all dashboard metrics" - -# Navigate through a multi-step form -just-scrape agentic-scraper https://example.com/wizard \ - -s "Click Next,Select Premium plan,Fill name with John,Click Submit" - -# Persistent session across multiple runs -just-scrape agentic-scraper https://app.example.com \ - -s "Click Settings" --use-session -``` - -## Generate Schema - -Generate a JSON schema from a natural language description. 
- -### Usage - -```bash -just-scrape generate-schema # AI generates a schema -just-scrape generate-schema --existing-schema -``` +# Crawl a docs site +just-scrape crawl https://docs.example.com --max-pages 20 --max-depth 3 -### Examples - -```bash -# Generate a schema for product data -just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array" +# Crawl staying within domain +just-scrape crawl https://example.com --max-pages 50 -# Refine an existing schema -just-scrape generate-schema "Add an availability field" \ - --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}' +# Get crawl results as JSON +just-scrape crawl https://example.com --json --max-pages 10 ``` ## History -Browse request history for any service. Interactive by default — arrow keys to navigate, select to view details, "Load more" for infinite scroll. +Browse request history for any service. Interactive by default — arrow keys to navigate, select to view details, "Load more" for pagination. 
### Usage ```bash -just-scrape history # Interactive browser -just-scrape history # Fetch specific request -just-scrape history --page # Start from page (default 1) -just-scrape history --page-size # Results per page (default 10, max 100) -just-scrape history --json # Raw JSON (pipeable) +just-scrape history # Interactive browser +just-scrape history # Fetch specific request +just-scrape history --page # Start from page (default 1) +just-scrape history --page-size # Results per page (default 20, max 100) +just-scrape history --json # Raw JSON (pipeable) ``` -Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap` +Services: `scrape`, `extract`, `search`, `monitor`, `crawl` ### Examples ```bash -# Browse your smart-scraper history interactively -just-scrape history smartscraper +# Browse your extract history interactively +just-scrape history extract # Jump to a specific request by ID -just-scrape history smartscraper abc123-def456-7890 +just-scrape history extract abc123-def456-7890 # Export crawl history as JSON -just-scrape history crawl --json --page-size 100 | jq '.requests[] | {id: .request_id, status}' +just-scrape history crawl --json --page-size 100 | jq '.[].status' ``` ## Credits @@ -356,18 +269,26 @@ Check your credit balance. ```bash just-scrape credits -just-scrape credits --json | jq '.remaining_credits' +just-scrape credits --json | jq '.remainingCredits' ``` -## Validate +--- -Validate your API key (health check). 
+## Migration from v0.2.x -```bash -just-scrape validate -``` +Commands have been renamed to match the v2 API: ---- +| Old command | New command | Notes | +|---|---|---| +| `smart-scraper` | `extract` | Renamed | +| `search-scraper` | `search` | Renamed | +| `markdownify` | `markdownify` | Now wraps `scrape --format markdown` | +| `scrape` | `scrape` | Gains `--format` flag (markdown, html, screenshot, branding) | +| `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external` | +| `agentic-scraper` | — | Removed from API | +| `generate-schema` | — | Removed from API | +| `sitemap` | — | Removed from API | +| `validate` | — | Removed from API | ## Contributing @@ -392,7 +313,7 @@ bun run dev --help | CLI Framework | **citty** (unjs) | | Prompts | **@clack/prompts** | | Styling | **chalk** v5 (ESM) | -| SDK | **scrapegraph-js** | +| SDK | **scrapegraph-js** v2 | | Env | **dotenv** | | Lint / Format | **Biome** | | Target | **Node.js 22+**, ESM-only | diff --git a/bun.lock b/bun.lock index 5a7bd89..1732297 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "^1.0.0", + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#feat/sdk-v2-migration", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,7 +229,7 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", "@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", 
"@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], - "scrapegraph-js": ["scrapegraph-js@1.0.0", "", {}, "sha512-eQn8/HRfJHjCoj2yia5yHWQTYUae/bYNhLEx00ZXF+GLKpgUJT0OCGUQM13WGSX5cgw9onz5EiaDJDbzcbeYtQ=="], + "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#4b86432", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-4b86432"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], diff --git a/package.json b/package.json index 55c9e7b..2d8cd18 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "just-scrape", - "version": "0.2.1", + "version": "0.3.0", "description": "ScrapeGraph AI CLI tool", "type": "module", "main": "dist/cli.mjs", @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "^1.0.0" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#feat/sdk-v2-migration" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/src/cli.ts b/src/cli.ts index 483a94c..255e93a 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -12,17 +12,13 @@ const main = defineCommand({ description: "ScrapeGraph AI CLI tool", }, subCommands: { - "smart-scraper": () => 
import("./commands/smart-scraper.js").then((m) => m.default), - "search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default), + extract: () => import("./commands/extract.js").then((m) => m.default), + search: () => import("./commands/search.js").then((m) => m.default), + scrape: () => import("./commands/scrape.js").then((m) => m.default), markdownify: () => import("./commands/markdownify.js").then((m) => m.default), crawl: () => import("./commands/crawl.js").then((m) => m.default), - sitemap: () => import("./commands/sitemap.js").then((m) => m.default), - scrape: () => import("./commands/scrape.js").then((m) => m.default), - "agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default), - "generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default), history: () => import("./commands/history.js").then((m) => m.default), credits: () => import("./commands/credits.js").then((m) => m.default), - validate: () => import("./commands/validate.js").then((m) => m.default), }, }); diff --git a/src/commands/agentic-scraper.ts b/src/commands/agentic-scraper.ts deleted file mode 100644 index 67e9b5b..0000000 --- a/src/commands/agentic-scraper.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "agentic-scraper", - description: "Browser automation with AI (login, click, navigate, fill forms)", - }, - args: { - url: { - type: "positional", - description: "Starting URL", - required: true, - }, - steps: { - type: "string", - alias: "s", - description: 'Comma-separated browser steps (e.g. 
"Click login,Fill email with x")', - }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt (used with --ai-extraction)", - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - "ai-extraction": { type: "boolean", description: "Enable AI extraction after steps" }, - "use-session": { type: "boolean", description: "Persist browser session across requests" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/agenticscraper"); - const key = await resolveApiKey(!!args.json); - - const steps = args.steps ? args.steps.split(",").map((s) => s.trim()) : []; - const params: scrapegraphai.AgenticScraperParams = { url: args.url, steps }; - if (args.prompt) params.user_prompt = args.prompt; - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args["ai-extraction"]) params.ai_extraction = true; - if (args["use-session"]) params.use_session = true; - - out.start("Running browser automation"); - const result = await scrapegraphai.agenticScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index d55101d..3b23357 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -1,8 +1,9 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; +const POLL_INTERVAL_MS = 3000; + export default defineCommand({ meta: { name: "crawl", @@ -14,49 +15,54 @@ export default defineCommand({ description: "Starting URL to crawl", required: true, }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt (required when extraction mode is 
on)", - }, - "no-extraction": { - type: "boolean", - description: "Return markdown only (2 credits/page instead of 10)", - }, - "max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" }, - depth: { type: "string", description: "Crawl depth (default 1)" }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - rules: { type: "string", description: "Crawl rules as JSON object string" }, - "no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" }, + "max-pages": { type: "string", description: "Maximum pages to crawl (default 50)" }, + "max-depth": { type: "string", description: "Crawl depth (default 2)" }, + "max-links-per-page": { type: "string", description: "Max links per page (default 10)" }, + "allow-external": { type: "boolean", description: "Allow crawling external domains" }, stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/smartcrawler"); - const key = await resolveApiKey(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/crawl"); + const sgai = await createClient(!!args.json); - const base: Record = { url: args.url }; - if (args["max-pages"]) base.max_pages = Number(args["max-pages"]); - if (args.depth) base.depth = Number(args.depth); - if (args.rules) base.rules = JSON.parse(args.rules); - if (args["no-sitemap"]) base.sitemap = false; - if (args.stealth) base.stealth = true; + const crawlOptions: Record = {}; + if (args["max-pages"]) crawlOptions.maxPages = Number(args["max-pages"]); + if (args["max-depth"]) crawlOptions.maxDepth = Number(args["max-depth"]); + if (args["max-links-per-page"]) + crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]); + if (args["allow-external"]) crawlOptions.allowExternal = true; + if 
(args.stealth) crawlOptions.fetchConfig = { stealth: true }; - if (args["no-extraction"]) { - base.extraction_mode = false; - } else { - if (args.prompt) base.prompt = args.prompt; - if (args.schema) base.schema = JSON.parse(args.schema); - } + out.start("Crawling"); + const t0 = performance.now(); + try { + const job = await sgai.crawl.start(args.url, crawlOptions as any); + const jobId = (job.data as { id: string }).id; - const params = base as scrapegraphai.CrawlParams; + if (!jobId) { + out.stop(Math.round(performance.now() - t0)); + out.result(job.data); + return; + } - out.start("Crawling"); - const result = await scrapegraphai.crawl(key, params, out.poll); - out.stop(result.elapsedMs); + // Poll until the crawl completes + while (true) { + await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); + const status = await sgai.crawl.status(jobId); + const statusData = status.data as { status: string; [key: string]: unknown }; + out.poll(statusData.status); - if (result.data) out.result(result.data); - else out.error(result.error); + if (statusData.status === "completed" || statusData.status === "failed" || statusData.status === "cancelled") { + out.stop(Math.round(performance.now() - t0)); + out.result(status.data); + return; + } + } + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? 
err.message : String(err)); + } }, }); diff --git a/src/commands/credits.ts b/src/commands/credits.ts index 0d7b75f..457e27d 100644 --- a/src/commands/credits.ts +++ b/src/commands/credits.ts @@ -1,6 +1,5 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -13,13 +12,17 @@ export default defineCommand({ }, run: async ({ args }) => { const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); + const sgai = await createClient(!!args.json); out.start("Fetching credits"); - const result = await scrapegraphai.getCredits(key); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); + const t0 = performance.now(); + try { + const result = await sgai.credits(); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? 
err.message : String(err)); + } }, }); diff --git a/src/commands/extract.ts b/src/commands/extract.ts new file mode 100644 index 0000000..bb0be8f --- /dev/null +++ b/src/commands/extract.ts @@ -0,0 +1,57 @@ +import { defineCommand } from "citty"; +import { createClient } from "../lib/client.js"; +import * as log from "../lib/log.js"; + +export default defineCommand({ + meta: { + name: "extract", + description: "Extract structured data from a URL using AI", + }, + args: { + url: { + type: "positional", + description: "Website URL to scrape", + required: true, + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt", + required: true, + }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, + stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + cookies: { type: "string", description: "Cookies as JSON object string" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/extract"); + const sgai = await createClient(!!args.json); + + const fetchConfig: Record = {}; + if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); + if (args.stealth) fetchConfig.stealth = true; + if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies); + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); + if (args.country) fetchConfig.country = args.country; + + const extractOptions: Record = { prompt: args.prompt }; + if (args.schema) extractOptions.schema = JSON.parse(args.schema); + if (Object.keys(fetchConfig).length > 0) extractOptions.fetchConfig = fetchConfig; + + 
out.start("Extracting"); + const t0 = performance.now(); + try { + const result = await sgai.extract(args.url, extractOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? err.message : String(err)); + } + }, +}); diff --git a/src/commands/generate-schema.ts b/src/commands/generate-schema.ts deleted file mode 100644 index 8d77e57..0000000 --- a/src/commands/generate-schema.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "generate-schema", - description: "Generate a JSON schema from a natural language prompt", - }, - args: { - prompt: { - type: "positional", - description: "Describe the schema you need", - required: true, - }, - "existing-schema": { - type: "string", - description: "Existing schema to modify (as JSON string)", - }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); - - const params: scrapegraphai.GenerateSchemaParams = { user_prompt: args.prompt }; - if (args["existing-schema"]) params.existing_schema = JSON.parse(args["existing-schema"]); - - out.start("Generating schema"); - const result = await scrapegraphai.generateSchema(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/history.ts b/src/commands/history.ts index 99ab59e..bf844b7 100644 --- a/src/commands/history.ts +++ b/src/commands/history.ts @@ -1,11 +1,10 @@ import * as p from "@clack/prompts"; import chalk from "chalk"; import { defineCommand } from "citty"; -import { HISTORY_SERVICES } from 
"scrapegraph-js"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; +const HISTORY_SERVICES = ["scrape", "extract", "search", "monitor", "crawl"] as const; const VALID = HISTORY_SERVICES.join(", "); const LOAD_MORE = "__load_more__"; @@ -49,98 +48,107 @@ export default defineCommand({ required: true, }, page: { type: "string", description: "Page number (default: 1)" }, - "page-size": { type: "string", description: "Results per page (default: 10, max: 100)" }, + "page-size": { type: "string", description: "Results per page (default: 20, max: 100)" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const quiet = !!args.json; const out = log.create(quiet); - const key = await resolveApiKey(quiet); - const service = args.service as scrapegraphai.HistoryParams["service"]; + const sgai = await createClient(quiet); + const service = args.service as (typeof HISTORY_SERVICES)[number]; const requestId = (args as { _: string[] })._.at(1); - const pageSize = args["page-size"] ? Number(args["page-size"]) : 10; + const limit = args["page-size"] ? Number(args["page-size"]) : 20; let page = args.page ? Number(args.page) : 1; const fetchPage = async (pg: number) => { - const r = await scrapegraphai.history(key, { service, page: pg, page_size: pageSize }); - if (r.status === "error") out.error(r.error); - const d = r.data as { requests: Record[]; next_key?: string }; - return { rows: d.requests ?? [], hasMore: !!d.next_key, ms: r.elapsedMs }; + const t0 = performance.now(); + const r = await sgai.history({ service, page: pg, limit }); + const ms = Math.round(performance.now() - t0); + const d = r.data as { data?: Record[]; requests?: Record[]; next_key?: string; total?: number }; + return { rows: d.data ?? d.requests ?? 
[], hasMore: !!d.next_key || (d.total != null && pg * limit < d.total), ms }; }; if (quiet || requestId) { - const { rows } = await fetchPage(page); - if (requestId) { - const match = rows.find((r) => getId(r) === requestId); - if (!match) out.error(`Request ${requestId} not found on page ${page}`); - out.result(match); - return; + try { + const { rows } = await fetchPage(page); + if (requestId) { + const match = rows.find((r) => getId(r) === requestId); + if (!match) out.error(`Request ${requestId} not found on page ${page}`); + out.result(match); + return; + } + out.result(rows); + } catch (err) { + out.error(err instanceof Error ? err.message : String(err)); } - out.result(rows); return; } out.start(`Fetching ${service} history`); - const first = await fetchPage(page); - out.stop(first.ms); - - if (first.rows.length === 0) { - p.log.warning("No history found."); - return; - } + try { + const first = await fetchPage(page); + out.stop(first.ms); - const allRows = [...first.rows]; - let hasMore = first.hasMore; - - while (true) { - const options = allRows.map((row) => ({ - value: getId(row), - label: label(row), - hint: hint(row), - })); - - if (hasMore) { - options.push({ - value: LOAD_MORE, - label: chalk.blue.bold("↓ Load more…"), - hint: `page ${page + 1}`, - }); + if (first.rows.length === 0) { + p.log.warning("No history found."); + return; } - const selected = await p.select({ - message: `${allRows.length} requests β€” select one to view`, - options, - maxItems: 15, - }); + const allRows = [...first.rows]; + let hasMore = first.hasMore; + + while (true) { + const options = allRows.map((row) => ({ + value: getId(row), + label: label(row), + hint: hint(row), + })); + + if (hasMore) { + options.push({ + value: LOAD_MORE, + label: chalk.blue.bold("↓ Load more…"), + hint: `page ${page + 1}`, + }); + } - if (p.isCancel(selected)) { - p.cancel("Cancelled"); - return; - } + const selected = await p.select({ + message: `${allRows.length} requests β€” select one to 
view`, + options, + maxItems: 15, + }); - if (selected === LOAD_MORE) { - page++; - const ls = p.spinner(); - ls.start(`Loading page ${page}`); - const next = await fetchPage(page); - ls.stop("Done"); + if (p.isCancel(selected)) { + p.cancel("Cancelled"); + return; + } - if (next.rows.length === 0) { - hasMore = false; - p.log.warning("No more results."); + if (selected === LOAD_MORE) { + page++; + const ls = p.spinner(); + ls.start(`Loading page ${page}`); + const next = await fetchPage(page); + ls.stop("Done"); + + if (next.rows.length === 0) { + hasMore = false; + p.log.warning("No more results."); + continue; + } + + allRows.push(...next.rows); + hasMore = next.hasMore; continue; } - allRows.push(...next.rows); - hasMore = next.hasMore; - continue; - } - - const match = allRows.find((r) => getId(r) === selected); - if (match) out.result(match); + const match = allRows.find((r) => getId(r) === selected); + if (match) out.result(match); - const back = await p.confirm({ message: "Back to list?" }); - if (p.isCancel(back) || !back) return; + const back = await p.confirm({ message: "Back to list?" }); + if (p.isCancel(back) || !back) return; + } + } catch (err) { + out.error(err instanceof Error ? 
err.message : String(err)); } }, }); diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index ccfc494..5aa9dbe 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -1,6 +1,5 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -20,21 +19,25 @@ export default defineCommand({ }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/markdownify"); - const key = await resolveApiKey(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); + const sgai = await createClient(!!args.json); - const params: scrapegraphai.MarkdownifyParams = { - website_url: args.url, - }; + const fetchConfig: Record = {}; + if (args.stealth) fetchConfig.stealth = true; + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); - if (args.stealth) params.stealth = true; - if (args.headers) params.headers = JSON.parse(args.headers); + const scrapeOptions: Record = { format: "markdown" }; + if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Converting to markdown"); - const result = await scrapegraphai.markdownify(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); + const t0 = performance.now(); + try { + const result = await sgai.scrape(args.url, scrapeOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? 
err.message : String(err)); + } }, }); diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index b0517eb..8339f27 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -1,12 +1,11 @@ import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; +import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ meta: { name: "scrape", - description: "Get raw HTML content from a URL", + description: "Scrape content from a URL (markdown, html, screenshot, or branding)", }, args: { url: { @@ -14,27 +13,37 @@ export default defineCommand({ description: "Website URL to scrape", required: true, }, + format: { + type: "string", + alias: "f", + description: "Output format: markdown (default), html, screenshot, branding", + }, stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - branding: { type: "boolean", description: "Extract branding info (+2 credits)" }, - "country-code": { type: "string", description: "ISO country code for geo-targeting" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/scrape"); - const key = await resolveApiKey(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); + const sgai = await createClient(!!args.json); - const params: scrapegraphai.ScrapeParams = { website_url: args.url }; + const fetchConfig: Record = {}; + if (args.stealth) fetchConfig.stealth = true; + if (args.country) fetchConfig.country = args.country; - if (args.stealth) params.stealth = true; - if (args.branding) params.branding = true; - if (args["country-code"]) params.country_code = args["country-code"]; + const scrapeOptions: Record = {}; + if 
(args.format) scrapeOptions.format = args.format; + if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Scraping"); - const result = await scrapegraphai.scrape(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); + const t0 = performance.now(); + try { + const result = await sgai.scrape(args.url, scrapeOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? err.message : String(err)); + } }, }); diff --git a/src/commands/search-scraper.ts b/src/commands/search-scraper.ts deleted file mode 100644 index 041e32c..0000000 --- a/src/commands/search-scraper.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "search-scraper", - description: "Search the web and extract data with AI", - }, - args: { - prompt: { - type: "positional", - description: "Search query and extraction instructions", - required: true, - }, - "num-results": { - type: "string", - description: "Number of websites to scrape (3-20, default 3)", - }, - "no-extraction": { - type: "boolean", - description: "Return markdown only (2 credits/site instead of 10)", - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - headers: { type: "string", description: "Custom headers as JSON object string" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/searchscraper"); - const key = await 
resolveApiKey(!!args.json); - - const params: scrapegraphai.SearchScraperParams = { - user_prompt: args.prompt, - }; - - if (args["num-results"]) params.num_results = Number(args["num-results"]); - if (args["no-extraction"]) params.extraction_mode = false; - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args.stealth) params.stealth = true; - if (args.headers) params.headers = JSON.parse(args.headers); - - out.start("Searching"); - const result = await scrapegraphai.searchScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/search.ts b/src/commands/search.ts new file mode 100644 index 0000000..24b56c9 --- /dev/null +++ b/src/commands/search.ts @@ -0,0 +1,51 @@ +import { defineCommand } from "citty"; +import { createClient } from "../lib/client.js"; +import * as log from "../lib/log.js"; + +export default defineCommand({ + meta: { + name: "search", + description: "Search the web and extract data with AI", + }, + args: { + query: { + type: "positional", + description: "Search query", + required: true, + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt for search results", + }, + "num-results": { + type: "string", + description: "Number of websites to scrape (1-20, default 3)", + }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/search"); + const sgai = await createClient(!!args.json); + + const searchOptions: Record = {}; + if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]); + if (args.schema) searchOptions.schema = JSON.parse(args.schema); + if 
(args.prompt) searchOptions.prompt = args.prompt; + if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) }; + + out.start("Searching"); + const t0 = performance.now(); + try { + const result = await sgai.search(args.query, searchOptions as any); + out.stop(Math.round(performance.now() - t0)); + out.result(result.data); + } catch (err) { + out.stop(Math.round(performance.now() - t0)); + out.error(err instanceof Error ? err.message : String(err)); + } + }, +}); diff --git a/src/commands/sitemap.ts b/src/commands/sitemap.ts deleted file mode 100644 index 2120b16..0000000 --- a/src/commands/sitemap.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "sitemap", - description: "Get all URLs from a website's sitemap", - }, - args: { - url: { - type: "positional", - description: "Website URL", - required: true, - }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/sitemap"); - const key = await resolveApiKey(!!args.json); - - out.start("Fetching sitemap"); - const result = await scrapegraphai.sitemap(key, { website_url: args.url }); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/smart-scraper.ts b/src/commands/smart-scraper.ts deleted file mode 100644 index be3d2a4..0000000 --- a/src/commands/smart-scraper.ts +++ /dev/null @@ -1,57 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "smart-scraper", - 
description: "Extract structured data from a URL using AI", - }, - args: { - url: { - type: "positional", - description: "Website URL to scrape", - required: true, - }, - prompt: { - type: "string", - alias: "p", - description: "Extraction prompt", - required: true, - }, - schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, - pages: { type: "string", description: "Total pages to scrape (1-100)" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, - cookies: { type: "string", description: "Cookies as JSON object string" }, - headers: { type: "string", description: "Custom headers as JSON object string" }, - "plain-text": { type: "boolean", description: "Return plain text instead of JSON" }, - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - out.docs("https://docs.scrapegraphai.com/services/smartscraper"); - const key = await resolveApiKey(!!args.json); - - const params: scrapegraphai.SmartScraperParams = { - website_url: args.url, - user_prompt: args.prompt, - }; - - if (args.schema) params.output_schema = JSON.parse(args.schema); - if (args.scrolls) params.number_of_scrolls = Number(args.scrolls); - if (args.pages) params.total_pages = Number(args.pages); - if (args.stealth) params.stealth = true; - if (args.cookies) params.cookies = JSON.parse(args.cookies); - if (args.headers) params.headers = JSON.parse(args.headers); - if (args["plain-text"]) params.plain_text = true; - - out.start("Scraping"); - const result = await scrapegraphai.smartScraper(key, params); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/commands/validate.ts b/src/commands/validate.ts deleted file mode 100644 index dd2c81d..0000000 --- a/src/commands/validate.ts +++ /dev/null 
@@ -1,25 +0,0 @@ -import { defineCommand } from "citty"; -import * as scrapegraphai from "scrapegraph-js"; -import { resolveApiKey } from "../lib/folders.js"; -import * as log from "../lib/log.js"; - -export default defineCommand({ - meta: { - name: "validate", - description: "Validate your API key (health check)", - }, - args: { - json: { type: "boolean", description: "Output raw JSON (pipeable)" }, - }, - run: async ({ args }) => { - const out = log.create(!!args.json); - const key = await resolveApiKey(!!args.json); - - out.start("Checking API health"); - const result = await scrapegraphai.checkHealth(key); - out.stop(result.elapsedMs); - - if (result.data) out.result(result.data); - else out.error(result.error); - }, -}); diff --git a/src/lib/client.ts b/src/lib/client.ts new file mode 100644 index 0000000..a83df1c --- /dev/null +++ b/src/lib/client.ts @@ -0,0 +1,18 @@ +import { scrapegraphai } from "scrapegraph-js"; +import { resolveApiKey } from "./folders.js"; + +let cached: ReturnType | null = null; + +export async function createClient(quiet = false) { + const apiKey = await resolveApiKey(quiet); + + if (cached) return cached; + + const baseUrl = process.env.SGAI_API_URL || undefined; + const timeout = process.env.SGAI_TIMEOUT_S + ? 
Number(process.env.SGAI_TIMEOUT_S) * 1000 + : undefined; + + cached = scrapegraphai({ apiKey, baseUrl, timeout }); + return cached; +} diff --git a/src/utils/banner.ts b/src/utils/banner.ts index 66c6386..d2bbe9c 100644 --- a/src/utils/banner.ts +++ b/src/utils/banner.ts @@ -30,8 +30,8 @@ export function showBanner() { console.log(text); console.log(chalk.hex(BANNER_COLOR)(TAGLINE)); console.log(chalk.hex(BANNER_COLOR)(`v${getVersion()}`)); - if (process.env.JUST_SCRAPE_API_URL) { - console.log(chalk.yellow(`β†’ Custom API: ${process.env.JUST_SCRAPE_API_URL}`)); + if (process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL) { + console.log(chalk.yellow(`β†’ Custom API: ${process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL}`)); } console.log(); } From 27be6f47f92fa06fd21e5e27e73cff04d44c3f91 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 9 Apr 2026 08:25:43 +0200 Subject: [PATCH 02/11] feat: replace --stealth flag with --mode fetch mode enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligns CLI with scrapegraph-js v2 SDK change (b570a57) that replaced stealth/render booleans with a unified fetch mode enum: auto, fast, js, direct+stealth, js+stealth. - All commands: --stealth boolean β†’ --mode string - Pin SDK to commit b570a57 (includes fetch mode change) - Update README and SKILL.md with new flag syntax Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 15 +-- bun.lock | 4 +- package.json | 2 +- skills/just-scrape/SKILL.md | 221 +++++++++++++----------------------- src/commands/crawl.ts | 4 +- src/commands/extract.ts | 4 +- src/commands/markdownify.ts | 4 +- src/commands/scrape.ts | 4 +- 8 files changed, 96 insertions(+), 162 deletions(-) diff --git a/README.md b/README.md index 05b9eb0..74c9ae6 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Extract structured data from any URL using AI (replaces `smart-scraper`). 
[docs] just-scrape extract -p # Extract data with AI just-scrape extract -p --schema # Enforce output schema just-scrape extract -p --scrolls # Infinite scroll (0-100) -just-scrape extract -p --stealth # Anti-bot bypass (+4 credits) +just-scrape extract -p --mode direct+stealth # Anti-bot bypass just-scrape extract -p --cookies --headers just-scrape extract -p --country # Geo-targeting ``` @@ -122,7 +122,7 @@ just-scrape extract https://news.example.com -p "Get all article headlines and d # Scrape a JS-heavy SPA behind anti-bot protection just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \ - --stealth + --mode js+stealth ``` ## Search @@ -161,7 +161,7 @@ just-scrape scrape # Markdown (default) just-scrape scrape -f html # Raw HTML just-scrape scrape -f screenshot # Screenshot just-scrape scrape -f branding # Extract branding info -just-scrape scrape --stealth # Anti-bot bypass (+4 credits) +just-scrape scrape -m direct+stealth # Anti-bot bypass just-scrape scrape --country # Geo-targeting ``` @@ -175,7 +175,7 @@ just-scrape scrape https://example.com just-scrape scrape https://example.com -f html # Scrape with anti-bot bypass and geo-targeting -just-scrape scrape https://store.example.com --stealth --country DE +just-scrape scrape https://store.example.com -m direct+stealth --country DE # Extract branding info (logos, colors, fonts) just-scrape scrape https://example.com -f branding @@ -189,7 +189,7 @@ Convert any webpage to clean markdown (convenience wrapper for `scrape --format ```bash just-scrape markdownify # Convert to markdown -just-scrape markdownify --stealth # Anti-bot bypass (+4 credits) +just-scrape markdownify -m direct+stealth # Anti-bot bypass just-scrape markdownify --headers # Custom headers ``` @@ -200,7 +200,7 @@ just-scrape markdownify --headers # Custom headers just-scrape markdownify https://blog.example.com/my-article # Convert a JS-rendered page behind Cloudflare -just-scrape markdownify 
https://protected.example.com --stealth +just-scrape markdownify https://protected.example.com -m js+stealth # Pipe markdown to a file just-scrape markdownify https://docs.example.com/api --json | jq -r '.markdown' > api-docs.md @@ -218,7 +218,7 @@ just-scrape crawl --max-pages # Max pages (default 50) just-scrape crawl --max-depth # Crawl depth (default 2) just-scrape crawl --max-links-per-page # Links per page (default 10) just-scrape crawl --allow-external # Allow external domains -just-scrape crawl --stealth # Anti-bot bypass +just-scrape crawl -m direct+stealth # Anti-bot bypass ``` ### Examples @@ -285,6 +285,7 @@ Commands have been renamed to match the v2 API: | `markdownify` | `markdownify` | Now wraps `scrape --format markdown` | | `scrape` | `scrape` | Gains `--format` flag (markdown, html, screenshot, branding) | | `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external` | +| `--stealth` flag | `--mode direct+stealth` | Fetch mode enum replaces boolean (`auto`, `fast`, `js`, `direct+stealth`, `js+stealth`) | | `agentic-scraper` | β€” | Removed from API | | `generate-schema` | β€” | Removed from API | | `sitemap` | β€” | Removed from API | diff --git a/bun.lock b/bun.lock index 1732297..a141378 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#feat/sdk-v2-migration", + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#b570a57", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,7 +229,7 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", 
"@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", "@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], - "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#4b86432", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-4b86432"], + "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#b570a57", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-b570a57"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], diff --git a/package.json b/package.json index 2d8cd18..4ed21c6 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#feat/sdk-v2-migration" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#b570a57" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/skills/just-scrape/SKILL.md b/skills/just-scrape/SKILL.md index 18acea8..d9a3b57 
100644 --- a/skills/just-scrape/SKILL.md +++ b/skills/just-scrape/SKILL.md @@ -1,6 +1,6 @@ --- name: just-scrape -description: "CLI tool for AI-powered web scraping, data extraction, search, and crawling via ScrapeGraph AI. Use when the user needs to scrape websites, extract structured data from URLs, convert pages to markdown, crawl multi-page sites, search the web for information, automate browser interactions (login, click, fill forms), get raw HTML, discover sitemaps, or generate JSON schemas. Triggers on tasks involving: (1) extracting data from websites, (2) web scraping or crawling, (3) converting webpages to markdown, (4) AI-powered web search with extraction, (5) browser automation, (6) generating output schemas for scraping. The CLI is just-scrape (npm package just-scrape)." +description: "CLI tool for AI-powered web scraping, data extraction, search, and crawling via ScrapeGraph AI v2 API. Use when the user needs to scrape websites, extract structured data from URLs, convert pages to markdown, crawl multi-page sites, or search the web for information. Triggers on tasks involving: (1) extracting data from websites, (2) web scraping or crawling, (3) converting webpages to markdown, (4) AI-powered web search with extraction. The CLI is just-scrape (npm package just-scrape)." 
--- # Web Scraping with just-scrape @@ -30,191 +30,132 @@ API key resolution order: `SGAI_API_KEY` env var → `.env` file → `~/.scrapeg | Need | Command | |---|---| -| Extract structured data from a known URL | `smart-scraper` | -| Search the web and extract from results | `search-scraper` | +| Extract structured data from a known URL | `extract` | +| Search the web and extract from results | `search` | +| Scrape a page (markdown, html, screenshot, branding) | `scrape` | | Convert a page to clean markdown | `markdownify` | | Crawl multiple pages from a site | `crawl` | -| Get raw HTML | `scrape` | -| Automate browser actions (login, click, fill) | `agentic-scraper` | -| Generate a JSON schema from description | `generate-schema` | -| Get all URLs from a sitemap | `sitemap` | | Check credit balance | `credits` | | Browse past requests | `history` | -| Validate API key | `validate` | ## Common Flags All commands support `--json` for machine-readable output (suppresses banner, spinners, prompts). Scraping commands share these optional flags: -- `--stealth` — bypass anti-bot detection (+4 credits) +- `--mode ` / `-m ` — fetch mode: `auto` (default), `fast`, `js`, `direct+stealth`, `js+stealth` - `--headers ` — custom HTTP headers as JSON string - `--schema ` — enforce output JSON schema +- `--country ` — ISO country code for geo-targeting ## Commands -### Smart Scraper +### Extract Extract structured data from any URL using AI. 
```bash -just-scrape smart-scraper -p -just-scrape smart-scraper -p --schema -just-scrape smart-scraper -p --scrolls # infinite scroll (0-100) -just-scrape smart-scraper -p --pages # multi-page (1-100) -just-scrape smart-scraper -p --stealth # anti-bot (+4 credits) -just-scrape smart-scraper -p --cookies --headers -just-scrape smart-scraper -p --plain-text +just-scrape extract -p +just-scrape extract -p --schema +just-scrape extract -p --scrolls # infinite scroll (0-100) +just-scrape extract -p --mode js+stealth # anti-bot bypass +just-scrape extract -p --cookies --headers +just-scrape extract -p --country # geo-targeting ``` ```bash # E-commerce extraction -just-scrape smart-scraper https://store.example.com/shoes -p "Extract all product names, prices, and ratings" +just-scrape extract https://store.example.com/shoes -p "Extract all product names, prices, and ratings" # Strict schema + scrolling -just-scrape smart-scraper https://news.example.com -p "Get headlines and dates" \ +just-scrape extract https://news.example.com -p "Get headlines and dates" \ --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \ --scrolls 5 # JS-heavy SPA behind anti-bot -just-scrape smart-scraper https://app.example.com/dashboard -p "Extract user stats" \ - --stealth +just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \ + --mode js+stealth ``` -### Search Scraper +### Search Search the web and extract structured data from results. 
```bash -just-scrape search-scraper -just-scrape search-scraper --num-results # sources to scrape (3-20, default 3) -just-scrape search-scraper --no-extraction # markdown only (2 credits vs 10) -just-scrape search-scraper --schema -just-scrape search-scraper --stealth --headers +just-scrape search +just-scrape search --num-results # sources to scrape (1-20, default 3) +just-scrape search -p # extraction prompt +just-scrape search --schema +just-scrape search --headers ``` ```bash # Research across sources -just-scrape search-scraper "Best Python web frameworks in 2025" --num-results 10 - -# Cheap markdown-only -just-scrape search-scraper "React vs Vue comparison" --no-extraction --num-results 5 +just-scrape search "Best Python web frameworks in 2025" --num-results 10 # Structured output -just-scrape search-scraper "Top 5 cloud providers pricing" \ +just-scrape search "Top 5 cloud providers pricing" \ --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}' ``` -### Markdownify - -Convert any webpage to clean markdown. - -```bash -just-scrape markdownify -just-scrape markdownify --stealth # +4 credits -just-scrape markdownify --headers -``` - -```bash -just-scrape markdownify https://blog.example.com/my-article -just-scrape markdownify https://protected.example.com --stealth -just-scrape markdownify https://docs.example.com/api --json | jq -r '.result' > api-docs.md -``` - -### Crawl - -Crawl multiple pages and extract data from each. 
- -```bash -just-scrape crawl -p -just-scrape crawl -p --max-pages # default 10 -just-scrape crawl -p --depth # default 1 -just-scrape crawl --no-extraction --max-pages # markdown only (2 credits/page) -just-scrape crawl -p --schema -just-scrape crawl -p --rules # include_paths, same_domain -just-scrape crawl -p --no-sitemap -just-scrape crawl -p --stealth -``` - -```bash -# Crawl docs site -just-scrape crawl https://docs.example.com -p "Extract all code snippets" --max-pages 20 --depth 3 - -# Filter to blog pages only -just-scrape crawl https://example.com -p "Extract article titles" \ - --rules '{"include_paths":["/blog/*"],"same_domain":true}' --max-pages 50 - -# Raw markdown, no AI extraction (cheaper) -just-scrape crawl https://example.com --no-extraction --max-pages 10 -``` - ### Scrape -Get raw HTML content from a URL. +Scrape content from a URL in various formats. ```bash -just-scrape scrape -just-scrape scrape --stealth # +4 credits -just-scrape scrape --branding # extract logos/colors/fonts (+2 credits) -just-scrape scrape --country-code +just-scrape scrape # markdown (default) +just-scrape scrape -f html # raw HTML +just-scrape scrape -f screenshot # screenshot +just-scrape scrape -f branding # extract branding info +just-scrape scrape -m direct+stealth # anti-bot bypass +just-scrape scrape --country # geo-targeting ``` ```bash just-scrape scrape https://example.com -just-scrape scrape https://store.example.com --stealth --country-code DE -just-scrape scrape https://example.com --branding +just-scrape scrape https://example.com -f html +just-scrape scrape https://store.example.com -m direct+stealth --country DE +just-scrape scrape https://example.com -f branding ``` -### Agentic Scraper +### Markdownify -Browser automation with AI — login, click, navigate, fill forms. Steps are comma-separated strings. +Convert any webpage to clean markdown (convenience wrapper for `scrape --format markdown`). 
```bash -just-scrape agentic-scraper -s -just-scrape agentic-scraper -s --ai-extraction -p -just-scrape agentic-scraper -s --schema -just-scrape agentic-scraper -s --use-session # persist browser session +just-scrape markdownify +just-scrape markdownify -m direct+stealth +just-scrape markdownify --headers ``` ```bash -# Login + extract dashboard -just-scrape agentic-scraper https://app.example.com/login \ - -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \ - --ai-extraction -p "Extract all dashboard metrics" - -# Multi-step form -just-scrape agentic-scraper https://example.com/wizard \ - -s "Click Next,Select Premium plan,Fill name with John,Click Submit" - -# Persistent session across runs -just-scrape agentic-scraper https://app.example.com \ - -s "Click Settings" --use-session +just-scrape markdownify https://blog.example.com/my-article +just-scrape markdownify https://protected.example.com -m js+stealth +just-scrape markdownify https://docs.example.com/api --json | jq -r '.markdown' > api-docs.md ``` -### Generate Schema +### Crawl -Generate a JSON schema from a natural language description. +Crawl multiple pages. The CLI starts the crawl and polls until completion. 
```bash -just-scrape generate-schema -just-scrape generate-schema --existing-schema +just-scrape crawl +just-scrape crawl --max-pages # default 50 +just-scrape crawl --max-depth # default 2 +just-scrape crawl --max-links-per-page # default 10 +just-scrape crawl --allow-external # allow external domains +just-scrape crawl -m direct+stealth # anti-bot bypass ``` ```bash -just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array" - -# Refine an existing schema -just-scrape generate-schema "Add an availability field" \ - --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}' -``` - -### Sitemap +# Crawl docs site +just-scrape crawl https://docs.example.com --max-pages 20 --max-depth 3 -Get all URLs from a website's sitemap. +# Crawl staying within domain +just-scrape crawl https://example.com --max-pages 50 -```bash -just-scrape sitemap -just-scrape sitemap https://example.com --json | jq -r '.urls[]' +# Get crawl results as JSON +just-scrape crawl https://example.com --json --max-pages 10 ``` ### History @@ -225,67 +166,59 @@ Browse request history. 
Interactive by default (arrow keys to navigate, select t just-scrape history # interactive browser just-scrape history # specific request just-scrape history --page -just-scrape history --page-size # max 100 +just-scrape history --page-size # default 20, max 100 just-scrape history --json ``` -Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap` +Services: `scrape`, `extract`, `search`, `monitor`, `crawl` ```bash -just-scrape history smartscraper -just-scrape history crawl --json --page-size 100 | jq '.requests[] | {id: .request_id, status}' +just-scrape history extract +just-scrape history crawl --json --page-size 100 | jq '.[].status' ``` -### Credits & Validate +### Credits ```bash just-scrape credits -just-scrape credits --json | jq '.remaining_credits' -just-scrape validate +just-scrape credits --json | jq '.remainingCredits' ``` ## Common Patterns -### Generate schema then scrape with it - -```bash -just-scrape generate-schema "Product with name, price, and reviews" --json | jq '.schema' > schema.json -just-scrape smart-scraper https://store.example.com -p "Extract products" --schema "$(cat schema.json)" -``` - ### Pipe JSON for scripting ```bash -just-scrape sitemap https://example.com --json | jq -r '.urls[]' | while read url; do - just-scrape smart-scraper "$url" -p "Extract title" --json >> results.jsonl -done +just-scrape extract https://example.com -p "Extract all links" --json | jq '.data' ``` ### Protected sites ```bash # JS-heavy SPA behind Cloudflare -just-scrape smart-scraper https://protected.example.com -p "Extract data" --stealth +just-scrape extract https://protected.example.com -p "Extract data" --mode js+stealth # With custom cookies/headers -just-scrape smart-scraper https://example.com -p "Extract data" \ +just-scrape extract https://example.com -p "Extract data" \ --cookies '{"session":"abc123"}' --headers '{"Authorization":"Bearer token"}' ``` -## Credit Costs +## Fetch Modes -| Feature 
| Extra Credits | +| Mode | Description | |---|---| -| `--stealth` | +4 per request | -| `--branding` (scrape only) | +2 | -| `search-scraper` extraction | 10 per request | -| `search-scraper --no-extraction` | 2 per request | -| `crawl --no-extraction` | 2 per page | +| `auto` | Automatic selection (default) | +| `fast` | Fastest, no JS rendering | +| `js` | Full JS rendering | +| `direct+stealth` | Direct fetch with anti-bot bypass | +| `js+stealth` | JS rendering with anti-bot bypass | ## Environment Variables ```bash SGAI_API_KEY=sgai-... # API key -JUST_SCRAPE_TIMEOUT_S=300 # Request timeout in seconds (default 120) -JUST_SCRAPE_DEBUG=1 # Debug logging to stderr +SGAI_API_URL=... # Override API base URL (default: https://api.scrapegraphai.com) +SGAI_TIMEOUT_S=30 # Request timeout in seconds (default 30) ``` + +Legacy variables (`JUST_SCRAPE_API_URL`, `JUST_SCRAPE_TIMEOUT_S`, `JUST_SCRAPE_DEBUG`) are still bridged. diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index 3b23357..5d8e80a 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -19,7 +19,7 @@ export default defineCommand({ "max-depth": { type: "string", description: "Crawl depth (default 2)" }, "max-links-per-page": { type: "string", description: "Max links per page (default 10)" }, "allow-external": { type: "boolean", description: "Allow crawling external domains" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { @@ -33,7 +33,7 @@ export default defineCommand({ if (args["max-links-per-page"]) crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]); if (args["allow-external"]) crawlOptions.allowExternal = true; - if (args.stealth) crawlOptions.fetchConfig = { stealth: true }; + if (args.mode) crawlOptions.fetchConfig 
= { mode: args.mode }; out.start("Crawling"); const t0 = performance.now(); diff --git a/src/commands/extract.ts b/src/commands/extract.ts index bb0be8f..0650c77 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -21,7 +21,7 @@ export default defineCommand({ }, schema: { type: "string", description: "Output JSON schema (as JSON string)" }, scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + mode: { type: "string", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, cookies: { type: "string", description: "Cookies as JSON object string" }, headers: { type: "string", description: "Custom headers as JSON object string" }, country: { type: "string", description: "ISO country code for geo-targeting" }, @@ -34,7 +34,7 @@ export default defineCommand({ const fetchConfig: Record = {}; if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); - if (args.stealth) fetchConfig.stealth = true; + if (args.mode) fetchConfig.mode = args.mode; if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies); if (args.headers) fetchConfig.headers = JSON.parse(args.headers); if (args.country) fetchConfig.country = args.country; diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index 5aa9dbe..42c8a4d 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -13,7 +13,7 @@ export default defineCommand({ description: "Website URL to convert", required: true, }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, headers: { type: "string", description: "Custom headers as JSON object string" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, @@ -23,7 +23,7 @@ export default defineCommand({ const sgai = await 
createClient(!!args.json); const fetchConfig: Record = {}; - if (args.stealth) fetchConfig.stealth = true; + if (args.mode) fetchConfig.mode = args.mode; if (args.headers) fetchConfig.headers = JSON.parse(args.headers); const scrapeOptions: Record = { format: "markdown" }; diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index 8339f27..e3a3702 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -18,7 +18,7 @@ export default defineCommand({ alias: "f", description: "Output format: markdown (default), html, screenshot, branding", }, - stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" }, + mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, country: { type: "string", description: "ISO country code for geo-targeting" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, @@ -28,7 +28,7 @@ export default defineCommand({ const sgai = await createClient(!!args.json); const fetchConfig: Record = {}; - if (args.stealth) fetchConfig.stealth = true; + if (args.mode) fetchConfig.mode = args.mode; if (args.country) fetchConfig.country = args.country; const scrapeOptions: Record = {}; From af777a8162f71cd637e4810a0a766cb1caa1d40b Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 13 Apr 2026 14:01:21 +0200 Subject: [PATCH 03/11] feat: align CLI with scrapegraph-js v2 c5bf757 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Bump scrapegraph-js pin b570a57 → c5bf757 - scrape: support 8 formats (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma-separated -f, add --html-mode, --scrolls, --prompt/--schema for json format - search: add --location-geo-code, --time-range, --format - crawl: add --format flag - README: document new flags and formats Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 56 +++++++++++++++++++---------- bun.lock | 4 
+-- package.json | 2 +- src/commands/crawl.ts | 19 ++++++++-- src/commands/scrape.ts | 81 +++++++++++++++++++++++++++++++++++++++--- src/commands/search.ts | 16 +++++++++ 6 files changed, 149 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 74c9ae6..d007ff3 100644 --- a/README.md +++ b/README.md @@ -132,10 +132,13 @@ Search the web and extract structured data from results (replaces `search-scrape ### Usage ```bash -just-scrape search # AI-powered web search -just-scrape search --num-results # Sources to scrape (1-20, default 3) -just-scrape search -p # Extraction prompt for results -just-scrape search --schema # Enforce output schema +just-scrape search # AI-powered web search +just-scrape search --num-results # Sources to scrape (1-20, default 3) +just-scrape search -p # Extraction prompt for results +just-scrape search --schema # Enforce output schema +just-scrape search --location-geo-code # Geo-target search (e.g. 'us', 'de', 'jp-tk') +just-scrape search --time-range # past_hour | past_24_hours | past_week | past_month | past_year +just-scrape search --format # Result format (default markdown) just-scrape search --headers ``` @@ -145,6 +148,9 @@ just-scrape search --headers # Research a topic across multiple sources just-scrape search "What are the best Python web frameworks in 2025?" --num-results 10 +# Recent news only, scoped to Germany +just-scrape search "EU AI act latest news" --time-range past_week --location-geo-code de + # Structured output with schema just-scrape search "Top 5 cloud providers pricing" \ --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}' @@ -152,33 +158,43 @@ just-scrape search "Top 5 cloud providers pricing" \ ## Scrape -Scrape content from a URL in various formats: markdown (default), html, screenshot, or branding. 
[docs](https://docs.scrapegraphai.com/api-reference/scrape) +Scrape content from a URL in one or more formats. The v2 API supports **8 formats**: `markdown`, `html`, `screenshot`, `branding`, `links`, `images`, `summary`, `json`. [docs](https://docs.scrapegraphai.com/api-reference/scrape) ### Usage ```bash -just-scrape scrape # Markdown (default) -just-scrape scrape -f html # Raw HTML -just-scrape scrape -f screenshot # Screenshot -just-scrape scrape -f branding # Extract branding info -just-scrape scrape -m direct+stealth # Anti-bot bypass -just-scrape scrape --country # Geo-targeting +just-scrape scrape # Markdown (default) +just-scrape scrape -f html # Raw HTML +just-scrape scrape -f screenshot # Page screenshot +just-scrape scrape -f branding # Branding (logos, colors, fonts) +just-scrape scrape -f links # Extracted links +just-scrape scrape -f images # Extracted images +just-scrape scrape -f summary # AI-generated page summary +just-scrape scrape -f json -p # Structured JSON via prompt +just-scrape scrape -f markdown,links,images # Multi-format (comma-separated) +just-scrape scrape --html-mode reader # normal (default), reader, or prune +just-scrape scrape --scrolls # Infinite scroll (0-100) +just-scrape scrape -m direct+stealth # Anti-bot bypass +just-scrape scrape --country # Geo-targeting ``` ### Examples ```bash -# Get markdown of a page +# Markdown of a page just-scrape scrape https://example.com -# Get raw HTML -just-scrape scrape https://example.com -f html +# Raw HTML with reader-mode extraction +just-scrape scrape https://blog.example.com -f html --html-mode reader + +# Multi-format: markdown + links + images in a single call +just-scrape scrape https://example.com -f markdown,links,images + +# Structured JSON output with a prompt +just-scrape scrape https://store.example.com -f json -p "Extract product name and price" # Scrape with anti-bot bypass and geo-targeting just-scrape scrape https://store.example.com -m direct+stealth --country DE - -# 
Extract branding info (logos, colors, fonts) -just-scrape scrape https://example.com -f branding ``` ## Markdownify @@ -218,6 +234,7 @@ just-scrape crawl --max-pages # Max pages (default 50) just-scrape crawl --max-depth # Crawl depth (default 2) just-scrape crawl --max-links-per-page # Links per page (default 10) just-scrape crawl --allow-external # Allow external domains +just-scrape crawl -f html # Page format (default markdown) just-scrape crawl -m direct+stealth # Anti-bot bypass ``` @@ -283,8 +300,9 @@ Commands have been renamed to match the v2 API: | `smart-scraper` | `extract` | Renamed | | `search-scraper` | `search` | Renamed | | `markdownify` | `markdownify` | Now wraps `scrape --format markdown` | -| `scrape` | `scrape` | Gains `--format` flag (markdown, html, screenshot, branding) | -| `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external` | +| `scrape` | `scrape` | Gains `--format` (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma, `--html-mode`, `--scrolls`, `--prompt`, `--schema` | +| `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external`, `--format` | +| `search` | `search` | New options: `--location-geo-code`, `--time-range`, `--format` | | `--stealth` flag | `--mode direct+stealth` | Fetch mode enum replaces boolean (`auto`, `fast`, `js`, `direct+stealth`, `js+stealth`) | | `agentic-scraper` | β€” | Removed from API | | `generate-schema` | β€” | Removed from API | diff --git a/bun.lock b/bun.lock index a141378..4ef0a87 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#b570a57", + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#c5bf757", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,7 +229,7 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": 
{ "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", "@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", "@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], - "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#b570a57", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-b570a57"], + "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#c5bf757", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-c5bf757"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], diff --git a/package.json b/package.json index 4ed21c6..e36766b 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": 
"^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#b570a57" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#c5bf757" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index 5d8e80a..da87044 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -19,7 +19,17 @@ export default defineCommand({ "max-depth": { type: "string", description: "Crawl depth (default 2)" }, "max-links-per-page": { type: "string", description: "Max links per page (default 10)" }, "allow-external": { type: "boolean", description: "Allow crawling external domains" }, - mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, + format: { + type: "string", + alias: "f", + description: + "Page format: markdown (default), html, screenshot, branding, links, images, summary", + }, + mode: { + type: "string", + alias: "m", + description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth", + }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { @@ -33,6 +43,7 @@ export default defineCommand({ if (args["max-links-per-page"]) crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]); if (args["allow-external"]) crawlOptions.allowExternal = true; + if (args.format) crawlOptions.format = args.format; if (args.mode) crawlOptions.fetchConfig = { mode: args.mode }; out.start("Crawling"); @@ -54,7 +65,11 @@ export default defineCommand({ const statusData = status.data as { status: string; [key: string]: unknown }; out.poll(statusData.status); - if (statusData.status === "completed" || statusData.status === "failed" || statusData.status === "cancelled") { + if ( + statusData.status === "completed" || + statusData.status === "failed" || + statusData.status === "cancelled" + ) { out.stop(Math.round(performance.now() - t0)); out.result(status.data); return; diff --git 
a/src/commands/scrape.ts b/src/commands/scrape.ts index e3a3702..13c5c99 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -2,10 +2,23 @@ import { defineCommand } from "citty"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; +const FORMATS = [ + "markdown", + "html", + "screenshot", + "branding", + "links", + "images", + "summary", + "json", +] as const; +type Format = (typeof FORMATS)[number]; + export default defineCommand({ meta: { name: "scrape", - description: "Scrape content from a URL (markdown, html, screenshot, or branding)", + description: + "Scrape content from a URL (markdown, html, screenshot, branding, links, images, summary, json)", }, args: { url: { @@ -16,9 +29,27 @@ export default defineCommand({ format: { type: "string", alias: "f", - description: "Output format: markdown (default), html, screenshot, branding", + description: `Output format: ${FORMATS.join(", ")} (default: markdown). Comma-separate for multi-format output.`, + }, + prompt: { + type: "string", + alias: "p", + description: "Prompt for json format (required when --format includes json)", }, - mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, + schema: { + type: "string", + description: "Schema for json format (JSON string)", + }, + mode: { + type: "string", + alias: "m", + description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth", + }, + "html-mode": { + type: "string", + description: "HTML/markdown extraction mode: normal (default), reader, prune", + }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, country: { type: "string", description: "ISO country code for geo-targeting" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, @@ -29,10 +60,50 @@ export default defineCommand({ const fetchConfig: Record = {}; if (args.mode) fetchConfig.mode = args.mode; + if (args.scrolls) 
fetchConfig.scrolls = Number(args.scrolls); if (args.country) fetchConfig.country = args.country; - const scrapeOptions: Record = {}; - if (args.format) scrapeOptions.format = args.format; + const requestedFormats = (args.format ?? "markdown") + .split(",") + .map((f) => f.trim()) + .filter(Boolean) as Format[]; + const htmlMode = (args["html-mode"] as "normal" | "reader" | "prune" | undefined) ?? "normal"; + + const formats = requestedFormats.map((f) => { + switch (f) { + case "markdown": + return { type: "markdown" as const, mode: htmlMode }; + case "html": + return { type: "html" as const, mode: htmlMode }; + case "screenshot": + return { type: "screenshot" as const }; + case "branding": + return { type: "branding" as const }; + case "links": + return { type: "links" as const }; + case "images": + return { type: "images" as const }; + case "summary": + return { type: "summary" as const }; + case "json": { + if (!args.prompt) { + out.error("--prompt is required when --format includes json"); + return { type: "json" as const }; + } + return { + type: "json" as const, + prompt: args.prompt, + schema: args.schema ? JSON.parse(args.schema) : undefined, + mode: htmlMode, + }; + } + default: + out.error(`Unknown format: ${f}. Valid: ${FORMATS.join(", ")}`); + return { type: "markdown" as const, mode: htmlMode }; + } + }); + + const scrapeOptions: Record = { formats }; if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Scraping"); diff --git a/src/commands/search.ts b/src/commands/search.ts index 24b56c9..1d4a51c 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -23,6 +23,19 @@ export default defineCommand({ description: "Number of websites to scrape (1-20, default 3)", }, schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + "location-geo-code": { + type: "string", + description: "Geo-location code for search (e.g. 
'us', 'de', 'jp-tk')", + }, + "time-range": { + type: "string", + description: + "Filter results by recency: past_hour, past_24_hours, past_week, past_month, past_year", + }, + format: { + type: "string", + description: "Result format: markdown (default) or html", + }, headers: { type: "string", description: "Custom headers as JSON object string" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, @@ -35,6 +48,9 @@ export default defineCommand({ if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]); if (args.schema) searchOptions.schema = JSON.parse(args.schema); if (args.prompt) searchOptions.prompt = args.prompt; + if (args["location-geo-code"]) searchOptions.locationGeoCode = args["location-geo-code"]; + if (args["time-range"]) searchOptions.timeRange = args["time-range"]; + if (args.format) searchOptions.format = args.format; if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) }; out.start("Searching"); From dc766a8bc24aee0a2682f31cd9aa62e7041f64c6 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 13 Apr 2026 14:04:47 +0200 Subject: [PATCH 04/11] fix(ci): build scrapegraph-js in-place and update smoke test for v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDK is pinned to a GitHub commit (not on npm yet) and ships without a prebuilt dist/, so module resolution fails right after bun install. Build it as a post-install CI step until v2 lands on npm. Also rewrite tests/smoke.test.ts — the old test still imported the v1 symbols (smartScraper, HISTORY_SERVICES) that no longer exist; replace with a sanity check against the v2 scrapegraphai() factory. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 4 ++++ tests/smoke.test.ts | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b404665..516fb29 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,6 +14,9 @@ jobs: - uses: actions/checkout@v4 - uses: oven-sh/setup-bun@v2 - run: bun install + # scrapegraph-js is pinned to a GitHub commit (not yet on npm) and ships + # without a prebuilt dist/ — build it in-place so module resolution works. + - run: cd node_modules/scrapegraph-js && bun install && bun run build - run: bun test lint: @@ -23,4 +26,5 @@ jobs: - uses: actions/checkout@v4 - uses: oven-sh/setup-bun@v2 - run: bun install + - run: cd node_modules/scrapegraph-js && bun install && bun run build - run: bun run check diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts index e2fab44..40ba725 100644 --- a/tests/smoke.test.ts +++ b/tests/smoke.test.ts @@ -1,7 +1,15 @@ import { expect, test } from "bun:test"; -import { HISTORY_SERVICES, smartScraper } from "scrapegraph-js"; +import { scrapegraphai } from "scrapegraph-js"; -test("sdk exports are available", () => { - expect(typeof smartScraper).toBe("function"); - expect(HISTORY_SERVICES.length).toBeGreaterThan(0); +test("sdk v2 factory is callable and exposes expected methods", () => { + expect(typeof scrapegraphai).toBe("function"); + + const client = scrapegraphai({ apiKey: "sgai-test" }); + expect(typeof client.scrape).toBe("function"); + expect(typeof client.extract).toBe("function"); + expect(typeof client.search).toBe("function"); + expect(typeof client.credits).toBe("function"); + expect(typeof client.history).toBe("function"); + expect(typeof client.crawl.start).toBe("function"); + expect(typeof client.crawl.status).toBe("function"); }); From fd4ad48ba9530fa2a59e92759c92429a493fe518 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 14 Apr 2026 09:13:12 
+0200 Subject: [PATCH 05/11] feat: align CLI with scrapegraph-js v2 PR #11 (016ae8b) Split compound fetch modes (direct+stealth, js+stealth) into separate --mode (auto|fast|js) and --stealth boolean flag. Add --nationality param to search command. Update SDK dependency to latest PR commit. Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 21 +++++++++++---------- bun.lock | 4 ++-- package.json | 2 +- src/commands/crawl.ts | 8 ++++++-- src/commands/extract.ts | 4 +++- src/commands/history.ts | 13 +++++++++++-- src/commands/markdownify.ts | 4 +++- src/commands/scrape.ts | 4 +++- src/commands/search.ts | 5 +++++ src/utils/banner.ts | 4 +++- 10 files changed, 48 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index d007ff3..06d223d 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Extract structured data from any URL using AI (replaces `smart-scraper`). [docs] just-scrape extract -p # Extract data with AI just-scrape extract -p --schema # Enforce output schema just-scrape extract -p --scrolls # Infinite scroll (0-100) -just-scrape extract -p --mode direct+stealth # Anti-bot bypass +just-scrape extract -p --mode js --stealth # Anti-bot bypass just-scrape extract -p --cookies --headers just-scrape extract -p --country # Geo-targeting ``` @@ -120,9 +120,9 @@ just-scrape extract https://news.example.com -p "Get all article headlines and d --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \ --scrolls 5 -# Scrape a JS-heavy SPA behind anti-bot protection +# Scrape a JS-heavy SPA with stealth mode just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \ - --mode js+stealth + --mode js --stealth ``` ## Search @@ -139,6 +139,7 @@ just-scrape search --schema # Enforce output s just-scrape search --location-geo-code # Geo-target search (e.g. 
'us', 'de', 'jp-tk') just-scrape search --time-range # past_hour | past_24_hours | past_week | past_month | past_year just-scrape search --format # Result format (default markdown) +just-scrape search --nationality # 2-letter ISO nationality code just-scrape search --headers ``` @@ -174,7 +175,7 @@ just-scrape scrape -f json -p # Structured JSON via just-scrape scrape -f markdown,links,images # Multi-format (comma-separated) just-scrape scrape --html-mode reader # normal (default), reader, or prune just-scrape scrape --scrolls # Infinite scroll (0-100) -just-scrape scrape -m direct+stealth # Anti-bot bypass +just-scrape scrape -m js --stealth # Anti-bot bypass just-scrape scrape --country # Geo-targeting ``` @@ -193,8 +194,8 @@ just-scrape scrape https://example.com -f markdown,links,images # Structured JSON output with a prompt just-scrape scrape https://store.example.com -f json -p "Extract product name and price" -# Scrape with anti-bot bypass and geo-targeting -just-scrape scrape https://store.example.com -m direct+stealth --country DE +# Scrape with stealth mode and geo-targeting +just-scrape scrape https://store.example.com --stealth --country DE ``` ## Markdownify @@ -205,7 +206,7 @@ Convert any webpage to clean markdown (convenience wrapper for `scrape --format ```bash just-scrape markdownify # Convert to markdown -just-scrape markdownify -m direct+stealth # Anti-bot bypass +just-scrape markdownify -m js --stealth # Anti-bot bypass just-scrape markdownify --headers # Custom headers ``` @@ -216,7 +217,7 @@ just-scrape markdownify --headers # Custom headers just-scrape markdownify https://blog.example.com/my-article # Convert a JS-rendered page behind Cloudflare -just-scrape markdownify https://protected.example.com -m js+stealth +just-scrape markdownify https://protected.example.com -m js --stealth # Pipe markdown to a file just-scrape markdownify https://docs.example.com/api --json | jq -r '.markdown' > api-docs.md @@ -235,7 +236,7 @@ just-scrape crawl 
--max-depth # Crawl depth (default 2) just-scrape crawl --max-links-per-page # Links per page (default 10) just-scrape crawl --allow-external # Allow external domains just-scrape crawl -f html # Page format (default markdown) -just-scrape crawl -m direct+stealth # Anti-bot bypass +just-scrape crawl -m js --stealth # Anti-bot bypass ``` ### Examples @@ -303,7 +304,7 @@ Commands have been renamed to match the v2 API: | `scrape` | `scrape` | Gains `--format` (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma, `--html-mode`, `--scrolls`, `--prompt`, `--schema` | | `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external`, `--format` | | `search` | `search` | New options: `--location-geo-code`, `--time-range`, `--format` | -| `--stealth` flag | `--mode direct+stealth` | Fetch mode enum replaces boolean (`auto`, `fast`, `js`, `direct+stealth`, `js+stealth`) | +| `--stealth` flag | `--stealth` | Separate boolean flag; fetch mode is now `auto`, `fast`, or `js` | | `agentic-scraper` | — | Removed from API | | `generate-schema` | — | Removed from API | | `sitemap` | — | Removed from API | diff --git a/bun.lock b/bun.lock index 4ef0a87..12fae36 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#c5bf757", + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#016ae8b", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,7 +229,7 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", "@rollup/rollup-linux-arm-musleabihf": 
"4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", "@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], - "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#c5bf757", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-c5bf757"], + "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#016ae8b", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-016ae8b"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], diff --git a/package.json b/package.json index e36766b..0151389 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#c5bf757" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#016ae8b" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index da87044..55db716 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ 
-28,8 +28,9 @@ export default defineCommand({ mode: { type: "string", alias: "m", - description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth", + description: "Fetch mode: auto (default), fast, js", }, + stealth: { type: "boolean", description: "Enable stealth mode" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, run: async ({ args }) => { @@ -44,7 +45,10 @@ export default defineCommand({ crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]); if (args["allow-external"]) crawlOptions.allowExternal = true; if (args.format) crawlOptions.format = args.format; - if (args.mode) crawlOptions.fetchConfig = { mode: args.mode }; + const fetchConfig: Record = {}; + if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; + if (Object.keys(fetchConfig).length > 0) crawlOptions.fetchConfig = fetchConfig; out.start("Crawling"); const t0 = performance.now(); diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 0650c77..ec3490d 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -21,7 +21,8 @@ export default defineCommand({ }, schema: { type: "string", description: "Output JSON schema (as JSON string)" }, scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, - mode: { type: "string", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, + mode: { type: "string", description: "Fetch mode: auto (default), fast, js" }, + stealth: { type: "boolean", description: "Enable stealth mode" }, cookies: { type: "string", description: "Cookies as JSON object string" }, headers: { type: "string", description: "Custom headers as JSON object string" }, country: { type: "string", description: "ISO country code for geo-targeting" }, @@ -35,6 +36,7 @@ export default defineCommand({ const fetchConfig: Record = {}; if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); if (args.mode) fetchConfig.mode = 
args.mode; + if (args.stealth) fetchConfig.stealth = true; if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies); if (args.headers) fetchConfig.headers = JSON.parse(args.headers); if (args.country) fetchConfig.country = args.country; diff --git a/src/commands/history.ts b/src/commands/history.ts index bf844b7..d9a19a1 100644 --- a/src/commands/history.ts +++ b/src/commands/history.ts @@ -64,8 +64,17 @@ export default defineCommand({ const t0 = performance.now(); const r = await sgai.history({ service, page: pg, limit }); const ms = Math.round(performance.now() - t0); - const d = r.data as { data?: Record[]; requests?: Record[]; next_key?: string; total?: number }; - return { rows: d.data ?? d.requests ?? [], hasMore: !!d.next_key || (d.total != null && pg * limit < d.total), ms }; + const d = r.data as { + data?: Record[]; + requests?: Record[]; + next_key?: string; + total?: number; + }; + return { + rows: d.data ?? d.requests ?? [], + hasMore: !!d.next_key || (d.total != null && pg * limit < d.total), + ms, + }; }; if (quiet || requestId) { diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index 42c8a4d..1de3a99 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -13,7 +13,8 @@ export default defineCommand({ description: "Website URL to convert", required: true, }, - mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth" }, + mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" }, + stealth: { type: "boolean", description: "Enable stealth mode" }, headers: { type: "string", description: "Custom headers as JSON object string" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, @@ -24,6 +25,7 @@ export default defineCommand({ const fetchConfig: Record = {}; if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; if (args.headers) fetchConfig.headers = 
JSON.parse(args.headers); const scrapeOptions: Record = { format: "markdown" }; diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index 13c5c99..0f7e4f6 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -43,8 +43,9 @@ export default defineCommand({ mode: { type: "string", alias: "m", - description: "Fetch mode: auto (default), fast, js, direct+stealth, js+stealth", + description: "Fetch mode: auto (default), fast, js", }, + stealth: { type: "boolean", description: "Enable stealth mode" }, "html-mode": { type: "string", description: "HTML/markdown extraction mode: normal (default), reader, prune", @@ -60,6 +61,7 @@ export default defineCommand({ const fetchConfig: Record = {}; if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); if (args.country) fetchConfig.country = args.country; diff --git a/src/commands/search.ts b/src/commands/search.ts index 1d4a51c..7d39afd 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -36,6 +36,10 @@ export default defineCommand({ type: "string", description: "Result format: markdown (default) or html", }, + nationality: { + type: "string", + description: "2-letter ISO nationality code for search personalization", + }, headers: { type: "string", description: "Custom headers as JSON object string" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, @@ -51,6 +55,7 @@ export default defineCommand({ if (args["location-geo-code"]) searchOptions.locationGeoCode = args["location-geo-code"]; if (args["time-range"]) searchOptions.timeRange = args["time-range"]; if (args.format) searchOptions.format = args.format; + if (args.nationality) searchOptions.nationality = args.nationality; if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) }; out.start("Searching"); diff --git a/src/utils/banner.ts b/src/utils/banner.ts index d2bbe9c..8ba47a8 100644 --- 
a/src/utils/banner.ts +++ b/src/utils/banner.ts @@ -31,7 +31,9 @@ export function showBanner() { console.log(chalk.hex(BANNER_COLOR)(TAGLINE)); console.log(chalk.hex(BANNER_COLOR)(`v${getVersion()}`)); if (process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL) { - console.log(chalk.yellow(`β†’ Custom API: ${process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL}`)); + console.log( + chalk.yellow(`β†’ Custom API: ${process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL}`), + ); } console.log(); } From 770c351e9e8db26bd6e5beccc0d411182e4dbd14 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 14 Apr 2026 09:16:08 +0200 Subject: [PATCH 06/11] fix(lint): replace `as any` casts with proper SDK types Import ApiExtractOptions, ApiScrapeOptions, ApiSearchOptions, and ApiCrawlOptions from scrapegraph-js to satisfy biome noExplicitAny rule. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/commands/crawl.ts | 5 +++-- src/commands/extract.ts | 5 +++-- src/commands/markdownify.ts | 5 +++-- src/commands/scrape.ts | 5 +++-- src/commands/search.ts | 5 +++-- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index 55db716..be6c00b 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -1,4 +1,5 @@ import { defineCommand } from "citty"; +import type { ApiCrawlOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -38,7 +39,7 @@ export default defineCommand({ out.docs("https://docs.scrapegraphai.com/api-reference/crawl"); const sgai = await createClient(!!args.json); - const crawlOptions: Record = {}; + const crawlOptions: ApiCrawlOptions = {}; if (args["max-pages"]) crawlOptions.maxPages = Number(args["max-pages"]); if (args["max-depth"]) crawlOptions.maxDepth = Number(args["max-depth"]); if (args["max-links-per-page"]) @@ -53,7 +54,7 @@ export default defineCommand({ out.start("Crawling"); const t0 = 
performance.now(); try { - const job = await sgai.crawl.start(args.url, crawlOptions as any); + const job = await sgai.crawl.start(args.url, crawlOptions); const jobId = (job.data as { id: string }).id; if (!jobId) { diff --git a/src/commands/extract.ts b/src/commands/extract.ts index ec3490d..7ada900 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -1,4 +1,5 @@ import { defineCommand } from "citty"; +import type { ApiExtractOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -41,14 +42,14 @@ export default defineCommand({ if (args.headers) fetchConfig.headers = JSON.parse(args.headers); if (args.country) fetchConfig.country = args.country; - const extractOptions: Record = { prompt: args.prompt }; + const extractOptions: ApiExtractOptions = { prompt: args.prompt }; if (args.schema) extractOptions.schema = JSON.parse(args.schema); if (Object.keys(fetchConfig).length > 0) extractOptions.fetchConfig = fetchConfig; out.start("Extracting"); const t0 = performance.now(); try { - const result = await sgai.extract(args.url, extractOptions as any); + const result = await sgai.extract(args.url, extractOptions); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index 1de3a99..ee99885 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -1,4 +1,5 @@ import { defineCommand } from "citty"; +import type { ApiScrapeOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -28,13 +29,13 @@ export default defineCommand({ if (args.stealth) fetchConfig.stealth = true; if (args.headers) fetchConfig.headers = JSON.parse(args.headers); - const scrapeOptions: Record = { format: "markdown" }; + const scrapeOptions: ApiScrapeOptions = { format: "markdown" }; if (Object.keys(fetchConfig).length > 0) 
scrapeOptions.fetchConfig = fetchConfig; out.start("Converting to markdown"); const t0 = performance.now(); try { - const result = await sgai.scrape(args.url, scrapeOptions as any); + const result = await sgai.scrape(args.url, scrapeOptions); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index 0f7e4f6..a5c7bf7 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -1,4 +1,5 @@ import { defineCommand } from "citty"; +import type { ApiScrapeOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -105,13 +106,13 @@ export default defineCommand({ } }); - const scrapeOptions: Record = { formats }; + const scrapeOptions: ApiScrapeOptions = { formats }; if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Scraping"); const t0 = performance.now(); try { - const result = await sgai.scrape(args.url, scrapeOptions as any); + const result = await sgai.scrape(args.url, scrapeOptions); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { diff --git a/src/commands/search.ts b/src/commands/search.ts index 7d39afd..b1adbaa 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -1,4 +1,5 @@ import { defineCommand } from "citty"; +import type { ApiSearchOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -48,7 +49,7 @@ export default defineCommand({ out.docs("https://docs.scrapegraphai.com/api-reference/search"); const sgai = await createClient(!!args.json); - const searchOptions: Record = {}; + const searchOptions: ApiSearchOptions = {}; if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]); if (args.schema) searchOptions.schema = JSON.parse(args.schema); if (args.prompt) searchOptions.prompt = args.prompt; @@ -61,7 +62,7 
@@ export default defineCommand({ out.start("Searching"); const t0 = performance.now(); try { - const result = await sgai.search(args.query, searchOptions as any); + const result = await sgai.search(args.query, searchOptions); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { From 0bcd66f408ef21c13f50a68ec38a4c120d77c0f5 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 14 Apr 2026 09:18:27 +0200 Subject: [PATCH 07/11] fix(lint): allow noExplicitAny in command files The SDK's Zod-inferred types have strict required fields (from .default()) that don't match partial CLI arg construction. Allow `as any` in src/commands/ where we bridge string args to the SDK. Co-Authored-By: Claude Opus 4.6 (1M context) --- biome.json | 2 +- src/commands/crawl.ts | 5 ++--- src/commands/extract.ts | 5 ++--- src/commands/markdownify.ts | 5 ++--- src/commands/scrape.ts | 5 ++--- src/commands/search.ts | 5 ++--- 6 files changed, 11 insertions(+), 16 deletions(-) diff --git a/biome.json b/biome.json index cf09862..c73fbd1 100644 --- a/biome.json +++ b/biome.json @@ -16,7 +16,7 @@ }, "overrides": [ { - "include": ["tests/**"], + "include": ["tests/**", "src/commands/**"], "linter": { "rules": { "suspicious": { diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index be6c00b..55db716 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -1,5 +1,4 @@ import { defineCommand } from "citty"; -import type { ApiCrawlOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -39,7 +38,7 @@ export default defineCommand({ out.docs("https://docs.scrapegraphai.com/api-reference/crawl"); const sgai = await createClient(!!args.json); - const crawlOptions: ApiCrawlOptions = {}; + const crawlOptions: Record = {}; if (args["max-pages"]) crawlOptions.maxPages = Number(args["max-pages"]); if (args["max-depth"]) crawlOptions.maxDepth = Number(args["max-depth"]); if 
(args["max-links-per-page"]) @@ -54,7 +53,7 @@ export default defineCommand({ out.start("Crawling"); const t0 = performance.now(); try { - const job = await sgai.crawl.start(args.url, crawlOptions); + const job = await sgai.crawl.start(args.url, crawlOptions as any); const jobId = (job.data as { id: string }).id; if (!jobId) { diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 7ada900..ec3490d 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -1,5 +1,4 @@ import { defineCommand } from "citty"; -import type { ApiExtractOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -42,14 +41,14 @@ export default defineCommand({ if (args.headers) fetchConfig.headers = JSON.parse(args.headers); if (args.country) fetchConfig.country = args.country; - const extractOptions: ApiExtractOptions = { prompt: args.prompt }; + const extractOptions: Record = { prompt: args.prompt }; if (args.schema) extractOptions.schema = JSON.parse(args.schema); if (Object.keys(fetchConfig).length > 0) extractOptions.fetchConfig = fetchConfig; out.start("Extracting"); const t0 = performance.now(); try { - const result = await sgai.extract(args.url, extractOptions); + const result = await sgai.extract(args.url, extractOptions as any); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index ee99885..1de3a99 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -1,5 +1,4 @@ import { defineCommand } from "citty"; -import type { ApiScrapeOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -29,13 +28,13 @@ export default defineCommand({ if (args.stealth) fetchConfig.stealth = true; if (args.headers) fetchConfig.headers = JSON.parse(args.headers); - const scrapeOptions: ApiScrapeOptions = { format: 
"markdown" }; + const scrapeOptions: Record = { format: "markdown" }; if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Converting to markdown"); const t0 = performance.now(); try { - const result = await sgai.scrape(args.url, scrapeOptions); + const result = await sgai.scrape(args.url, scrapeOptions as any); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index a5c7bf7..0f7e4f6 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -1,5 +1,4 @@ import { defineCommand } from "citty"; -import type { ApiScrapeOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -106,13 +105,13 @@ export default defineCommand({ } }); - const scrapeOptions: ApiScrapeOptions = { formats }; + const scrapeOptions: Record = { formats }; if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; out.start("Scraping"); const t0 = performance.now(); try { - const result = await sgai.scrape(args.url, scrapeOptions); + const result = await sgai.scrape(args.url, scrapeOptions as any); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { diff --git a/src/commands/search.ts b/src/commands/search.ts index b1adbaa..7d39afd 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -1,5 +1,4 @@ import { defineCommand } from "citty"; -import type { ApiSearchOptions } from "scrapegraph-js"; import { createClient } from "../lib/client.js"; import * as log from "../lib/log.js"; @@ -49,7 +48,7 @@ export default defineCommand({ out.docs("https://docs.scrapegraphai.com/api-reference/search"); const sgai = await createClient(!!args.json); - const searchOptions: ApiSearchOptions = {}; + const searchOptions: Record = {}; if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]); if (args.schema) 
searchOptions.schema = JSON.parse(args.schema); if (args.prompt) searchOptions.prompt = args.prompt; @@ -62,7 +61,7 @@ export default defineCommand({ out.start("Searching"); const t0 = performance.now(); try { - const result = await sgai.search(args.query, searchOptions); + const result = await sgai.search(args.query, searchOptions as any); out.stop(Math.round(performance.now() - t0)); out.result(result.data); } catch (err) { From c1c864284f29de3b5bdcc9faafcd13276b273196 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 14 Apr 2026 13:21:10 +0200 Subject: [PATCH 08/11] fix: remove incorrect --nationality flag, use --location-geo-code only Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 1 - src/commands/search.ts | 5 ----- 2 files changed, 6 deletions(-) diff --git a/README.md b/README.md index 06d223d..18c1444 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,6 @@ just-scrape search --schema # Enforce output s just-scrape search --location-geo-code # Geo-target search (e.g. 
'us', 'de', 'jp-tk') just-scrape search --time-range # past_hour | past_24_hours | past_week | past_month | past_year just-scrape search --format # Result format (default markdown) -just-scrape search --nationality # 2-letter ISO nationality code just-scrape search --headers ``` diff --git a/src/commands/search.ts b/src/commands/search.ts index 7d39afd..1d4a51c 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -36,10 +36,6 @@ export default defineCommand({ type: "string", description: "Result format: markdown (default) or html", }, - nationality: { - type: "string", - description: "2-letter ISO nationality code for search personalization", - }, headers: { type: "string", description: "Custom headers as JSON object string" }, json: { type: "boolean", description: "Output raw JSON (pipeable)" }, }, @@ -55,7 +51,6 @@ export default defineCommand({ if (args["location-geo-code"]) searchOptions.locationGeoCode = args["location-geo-code"]; if (args["time-range"]) searchOptions.timeRange = args["time-range"]; if (args.format) searchOptions.format = args.format; - if (args.nationality) searchOptions.nationality = args.nationality; if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) }; out.start("Searching"); From 9f98553f98217e4f7f9a223380daaa30d1170a94 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 14 Apr 2026 13:30:19 +0200 Subject: [PATCH 09/11] refactor: rename --location-geo-code to --country in search Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 6 +++--- src/commands/search.ts | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 18c1444..fd3816e 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ just-scrape search # AI-powered web s just-scrape search --num-results # Sources to scrape (1-20, default 3) just-scrape search -p # Extraction prompt for results just-scrape search --schema # Enforce output schema -just-scrape search 
--location-geo-code # Geo-target search (e.g. 'us', 'de', 'jp-tk') +just-scrape search --country # Geo-target search (e.g. 'us', 'de', 'jp') just-scrape search --time-range # past_hour | past_24_hours | past_week | past_month | past_year just-scrape search --format # Result format (default markdown) just-scrape search --headers @@ -149,7 +149,7 @@ just-scrape search --headers just-scrape search "What are the best Python web frameworks in 2025?" --num-results 10 # Recent news only, scoped to Germany -just-scrape search "EU AI act latest news" --time-range past_week --location-geo-code de +just-scrape search "EU AI act latest news" --time-range past_week --country de # Structured output with schema just-scrape search "Top 5 cloud providers pricing" \ @@ -302,7 +302,7 @@ Commands have been renamed to match the v2 API: | `markdownify` | `markdownify` | Now wraps `scrape --format markdown` | | `scrape` | `scrape` | Gains `--format` (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma, `--html-mode`, `--scrolls`, `--prompt`, `--schema` | | `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external`, `--format` | -| `search` | `search` | New options: `--location-geo-code`, `--time-range`, `--format` | +| `search` | `search` | New options: `--country`, `--time-range`, `--format` | | `--stealth` flag | `--stealth` | Separate boolean flag; fetch mode is now `auto`, `fast`, or `js` | | `agentic-scraper` | — | Removed from API | | `generate-schema` | — | Removed from API | diff --git a/src/commands/search.ts b/src/commands/search.ts index 1d4a51c..c6f5ed0 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -23,9 +23,9 @@ export default defineCommand({ description: "Number of websites to scrape (1-20, default 3)", }, schema: { type: "string", description: "Output JSON schema (as JSON string)" }, - "location-geo-code": { + country: { type: "string", - description: "Geo-location code for
search (e.g. 'us', 'de', 'jp-tk')", + description: "Country code for geo-targeted search (e.g. 'us', 'de', 'jp')", }, "time-range": { type: "string", @@ -48,7 +48,7 @@ export default defineCommand({ if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]); if (args.schema) searchOptions.schema = JSON.parse(args.schema); if (args.prompt) searchOptions.prompt = args.prompt; - if (args["location-geo-code"]) searchOptions.locationGeoCode = args["location-geo-code"]; + if (args.country) searchOptions.country = args.country; if (args["time-range"]) searchOptions.timeRange = args["time-range"]; if (args.format) searchOptions.format = args.format; if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) }; From d5211ae07591d25024666dacc3c114dbe60b7248 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 14 Apr 2026 15:10:41 +0200 Subject: [PATCH 10/11] feat: align CLI with scrapegraph-js v2 PR #13 (0738786) - Replace SDK factory with raw function imports (scrape, extract, search, crawl, monitor, history, getCredits) - Add monitor command (create, list, get, update, delete, pause, resume) - Update crawl to use formats array and crawl.get instead of crawl.status - Update history to use history.list/history.get with new pagination response - Update search to pass query in params, remove nationality flag - Update extract to pass url in params - Make history service filter optional - Update README with monitor docs and v2 migration notes Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 95 ++++++++++---- bun.lock | 6 +- package.json | 2 +- src/cli.ts | 1 + src/commands/crawl.ts | 58 +++++---- src/commands/credits.ts | 12 +- src/commands/extract.ts | 25 ++-- src/commands/history.ts | 75 ++++++----- src/commands/markdownify.ts | 19 +-- src/commands/monitor.ts | 243 ++++++++++++++++++++++++++++++++++++ src/commands/scrape.ts | 18 +-- src/commands/search.ts | 28 ++--- src/lib/client.ts | 15 +-- 13 files changed, 452 
insertions(+), 145 deletions(-) create mode 100644 src/commands/monitor.ts diff --git a/README.md b/README.md index fd3816e..64a3547 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Made with love by the [ScrapeGraphAI team](https://scrapegraphai.com) 💜 ![Demo Video](/assets/demo.gif) -Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, and crawling. Uses the **v2 API**. +Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, crawling, and monitoring. Uses the **v2 API**. ## Project Structure @@ -14,7 +14,7 @@ just-scrape/ ├── src/ │ ├── cli.ts # Entry point, citty main command + subcommands │ ├── lib/ -│ │ ├── client.ts # ScrapeGraphAI v2 client factory +│ │ ├── client.ts # API key resolver │ │ ├── env.ts # Env config (API key, JUST_SCRAPE_* → SGAI_* bridge) │ │ ├── folders.ts # API key resolution + interactive prompt │ │ └── log.ts # Logger factory + syntax-highlighted JSON output @@ -24,6 +24,7 @@ just-scrape/ │ │ ├── scrape.ts │ │ ├── markdownify.ts │ │ ├── crawl.ts +│ │ ├── monitor.ts │ │ ├── history.ts │ │ └── credits.ts │ └── utils/ @@ -87,22 +88,23 @@ Legacy variables (`JUST_SCRAPE_API_URL`, `JUST_SCRAPE_TIMEOUT_S`, `JUST_SCRAPE_D All commands support `--json` for machine-readable output. When set, banner, spinners, and interactive prompts are suppressed — only minified JSON on stdout (saves tokens when piped to AI agents).
```bash -just-scrape credits --json | jq '.remainingCredits' +just-scrape credits --json | jq '.remaining' just-scrape extract https://example.com -p "Extract data" --json > result.json -just-scrape history extract --json | jq '.[].status' +just-scrape history scrape --json | jq '.[].status' ``` --- ## Extract -Extract structured data from any URL using AI (replaces `smart-scraper`). [docs](https://docs.scrapegraphai.com/api-reference/extract) +Extract structured data from any URL using AI. [docs](https://docs.scrapegraphai.com/api-reference/extract) ### Usage ```bash just-scrape extract -p # Extract data with AI just-scrape extract -p --schema # Enforce output schema +just-scrape extract -p --mode # HTML mode: normal, reader, prune just-scrape extract -p --scrolls # Infinite scroll (0-100) just-scrape extract -p --mode js --stealth # Anti-bot bypass just-scrape extract -p --cookies --headers @@ -122,12 +124,12 @@ just-scrape extract https://news.example.com -p "Get all article headlines and d # Scrape a JS-heavy SPA with stealth mode just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \ - --mode js --stealth + --stealth ``` ## Search -Search the web and extract structured data from results (replaces `search-scraper`). [docs](https://docs.scrapegraphai.com/api-reference/search) +Search the web and extract structured data from results. [docs](https://docs.scrapegraphai.com/api-reference/search) ### Usage @@ -135,8 +137,8 @@ Search the web and extract structured data from results (replaces `search-scrape just-scrape search # AI-powered web search just-scrape search --num-results # Sources to scrape (1-20, default 3) just-scrape search -p # Extraction prompt for results -just-scrape search --schema # Enforce output schema -just-scrape search --country # Geo-target search (e.g. 'us', 'de', 'jp') +just-scrape search --schema # Enforce output schema (requires -p) +just-scrape search --country # Geo-target search (e.g. 
'us', 'de', 'jp') just-scrape search --time-range # past_hour | past_24_hours | past_week | past_month | past_year just-scrape search --format # Result format (default markdown) just-scrape search --headers @@ -153,6 +155,7 @@ just-scrape search "EU AI act latest news" --time-range past_week --country de # Structured output with schema just-scrape search "Top 5 cloud providers pricing" \ + -p "Extract provider name and free tier details" \ --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}' ``` @@ -219,12 +222,12 @@ just-scrape markdownify https://blog.example.com/my-article just-scrape markdownify https://protected.example.com -m js --stealth # Pipe markdown to a file -just-scrape markdownify https://docs.example.com/api --json | jq -r '.markdown' > api-docs.md +just-scrape markdownify https://docs.example.com/api --json | jq -r '.results.markdown.data[0]' > api-docs.md ``` ## Crawl -Crawl multiple pages. The CLI starts the crawl and polls until completion. [docs](https://docs.scrapegraphai.com/api-reference/crawl) +Crawl multiple pages. The CLI starts the crawl and polls until completion. Supports the same format options as scrape. [docs](https://docs.scrapegraphai.com/api-reference/crawl) ### Usage @@ -235,6 +238,7 @@ just-scrape crawl --max-depth # Crawl depth (default 2) just-scrape crawl --max-links-per-page # Links per page (default 10) just-scrape crawl --allow-external # Allow external domains just-scrape crawl -f html # Page format (default markdown) +just-scrape crawl -f markdown,links # Multi-format (comma-separated) just-scrape crawl -m js --stealth # Anti-bot bypass ``` @@ -251,21 +255,63 @@ just-scrape crawl https://example.com --max-pages 50 just-scrape crawl https://example.com --json --max-pages 10 ``` +## Monitor + +Create and manage page-change monitors. Monitors periodically scrape a URL and detect changes. 
[docs](https://docs.scrapegraphai.com/api-reference/monitor) + +### Usage + +```bash +just-scrape monitor create --url --interval # Create a monitor +just-scrape monitor create --url --interval 1h --name "My Monitor" +just-scrape monitor create --url --interval 30m --webhook-url +just-scrape monitor create --url --interval 1d -f markdown,screenshot +just-scrape monitor list # List all monitors +just-scrape monitor get --id # Get monitor details +just-scrape monitor update --id --interval 2h # Update interval +just-scrape monitor pause --id # Pause a monitor +just-scrape monitor resume --id # Resume a paused monitor +just-scrape monitor delete --id # Delete a monitor +``` + +### Examples + +```bash +# Monitor a pricing page every hour +just-scrape monitor create --url https://store.example.com/pricing --interval 1h + +# Monitor with webhook notification +just-scrape monitor create --url https://example.com \ + --interval 30m --webhook-url https://hooks.example.com/notify + +# Monitor markdown + screenshot changes daily +just-scrape monitor create --url https://example.com \ + --interval 1d -f markdown,screenshot --name "Daily check" + +# List all monitors +just-scrape monitor list + +# Pause and resume +just-scrape monitor pause --id abc123 +just-scrape monitor resume --id abc123 +``` + ## History -Browse request history for any service. Interactive by default — arrow keys to navigate, select to view details, "Load more" for pagination. +Browse request history. Interactive by default — arrow keys to navigate, select to view details, "Load more" for pagination. Service filter is optional.
### Usage ```bash -just-scrape history # Interactive browser -just-scrape history # Fetch specific request -just-scrape history --page # Start from page (default 1) -just-scrape history --page-size # Results per page (default 20, max 100) -just-scrape history --json # Raw JSON (pipeable) +just-scrape history # All history (interactive) +just-scrape history # Filter by service +just-scrape history # Fetch specific request by ID +just-scrape history --page # Start from page (default 1) +just-scrape history --page-size # Results per page (default 20, max 100) +just-scrape history --json # Raw JSON (pipeable) ``` -Services: `scrape`, `extract`, `search`, `monitor`, `crawl` +Services: `scrape`, `extract`, `schema`, `search`, `monitor`, `crawl` ### Examples @@ -274,10 +320,10 @@ Services: `scrape`, `extract`, `search`, `monitor`, `crawl` just-scrape history extract # Jump to a specific request by ID -just-scrape history extract abc123-def456-7890 +just-scrape history scrape abc123-def456-7890 -# Export crawl history as JSON -just-scrape history crawl --json --page-size 100 | jq '.[].status' +# Export all history as JSON +just-scrape history --json --page-size 100 | jq '.[].status' ``` ## Credits @@ -286,7 +332,7 @@ Check your credit balance. 
```bash just-scrape credits -just-scrape credits --json | jq '.remainingCredits' +just-scrape credits --json | jq '.remaining' ``` --- @@ -301,11 +347,12 @@ Commands have been renamed to match the v2 API: | `search-scraper` | `search` | Renamed | | `markdownify` | `markdownify` | Now wraps `scrape --format markdown` | | `scrape` | `scrape` | Gains `--format` (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma, `--html-mode`, `--scrolls`, `--prompt`, `--schema` | -| `crawl` | `crawl` | New options: `--max-depth`, `--max-links-per-page`, `--allow-external`, `--format` | +| `crawl` | `crawl` | Now uses `formats` array like scrape, supports multi-format | | `search` | `search` | New options: `--country`, `--time-range`, `--format` | +| — | `monitor` | **New**: create, list, get, update, delete, pause, resume page-change monitors | | `--stealth` flag | `--stealth` | Separate boolean flag; fetch mode is now `auto`, `fast`, or `js` | | `agentic-scraper` | — | Removed from API | -| `generate-schema` | — | Removed from API | +| `generate-schema` | — | Removed from CLI (still available in SDK) | | `sitemap` | — | Removed from API | | `validate` | — | Removed from API | diff --git a/bun.lock b/bun.lock index 12fae36..7fbc6f0 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#016ae8b", + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#0738786", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,7 +229,7 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf":
"4.57.1", "@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", "@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], - "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#016ae8b", { "peerDependencies": { "zod": "^3.0.0 || ^4.0.0" }, "optionalPeers": ["zod"] }, "ScrapeGraphAI-scrapegraph-js-016ae8b"], + "scrapegraph-js": ["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#0738786", { "dependencies": { "zod": "^4.3.6" } }, "ScrapeGraphAI-scrapegraph-js-0738786"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], @@ -256,5 +256,7 @@ "ufo": ["ufo@1.6.3", "", {}, "sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q=="], "undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], + + "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], } } diff --git a/package.json b/package.json index 0151389..c2b17c5 100644 --- 
a/package.json +++ b/package.json @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#016ae8b" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#0738786" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/src/cli.ts b/src/cli.ts index 255e93a..8591f97 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -17,6 +17,7 @@ const main = defineCommand({ scrape: () => import("./commands/scrape.js").then((m) => m.default), markdownify: () => import("./commands/markdownify.js").then((m) => m.default), crawl: () => import("./commands/crawl.js").then((m) => m.default), + monitor: () => import("./commands/monitor.js").then((m) => m.default), history: () => import("./commands/history.js").then((m) => m.default), credits: () => import("./commands/credits.js").then((m) => m.default), }, diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index 55db716..bfca8f0 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -1,5 +1,6 @@ import { defineCommand } from "citty"; -import { createClient } from "../lib/client.js"; +import { crawl } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; import * as log from "../lib/log.js"; const POLL_INTERVAL_MS = 3000; @@ -23,7 +24,7 @@ export default defineCommand({ type: "string", alias: "f", description: - "Page format: markdown (default), html, screenshot, branding, links, images, summary", + "Page format: markdown (default), html, screenshot, branding, links, images, summary. Comma-separate for multi-format.", }, mode: { type: "string", @@ -36,51 +37,58 @@ export default defineCommand({ run: async ({ args }) => { const out = log.create(!!args.json); out.docs("https://docs.scrapegraphai.com/api-reference/crawl"); - const sgai = await createClient(!!args.json); + const apiKey = await getApiKey(!!args.json); + + const requestedFormats = (args.format ?? 
"markdown") + .split(",") + .map((f) => f.trim()) + .filter(Boolean); + + const formats = requestedFormats.map((f) => { + if (f === "markdown" || f === "html") return { type: f as "markdown" | "html", mode: "normal" as const }; + return { type: f }; + }); + + const params: Record = { url: args.url, formats }; + if (args["max-pages"]) params.maxPages = Number(args["max-pages"]); + if (args["max-depth"]) params.maxDepth = Number(args["max-depth"]); + if (args["max-links-per-page"]) params.maxLinksPerPage = Number(args["max-links-per-page"]); + if (args["allow-external"]) params.allowExternal = true; - const crawlOptions: Record = {}; - if (args["max-pages"]) crawlOptions.maxPages = Number(args["max-pages"]); - if (args["max-depth"]) crawlOptions.maxDepth = Number(args["max-depth"]); - if (args["max-links-per-page"]) - crawlOptions.maxLinksPerPage = Number(args["max-links-per-page"]); - if (args["allow-external"]) crawlOptions.allowExternal = true; - if (args.format) crawlOptions.format = args.format; const fetchConfig: Record = {}; if (args.mode) fetchConfig.mode = args.mode; if (args.stealth) fetchConfig.stealth = true; - if (Object.keys(fetchConfig).length > 0) crawlOptions.fetchConfig = fetchConfig; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; out.start("Crawling"); - const t0 = performance.now(); try { - const job = await sgai.crawl.start(args.url, crawlOptions as any); - const jobId = (job.data as { id: string }).id; + const job = await crawl.start(apiKey, params as any); + const jobData = job.data as { id: string; status: string } | null; - if (!jobId) { - out.stop(Math.round(performance.now() - t0)); + if (!jobData?.id) { + out.stop(job.elapsedMs); out.result(job.data); return; } - // Poll until the crawl completes while (true) { await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); - const status = await sgai.crawl.status(jobId); - const statusData = status.data as { status: string; [key: string]: unknown }; - 
out.poll(statusData.status); + const status = await crawl.get(apiKey, jobData.id); + const statusData = status.data as { status: string; [key: string]: unknown } | null; + out.poll(statusData?.status ?? "unknown"); if ( - statusData.status === "completed" || - statusData.status === "failed" || - statusData.status === "cancelled" + statusData?.status === "completed" || + statusData?.status === "failed" || + statusData?.status === "deleted" ) { - out.stop(Math.round(performance.now() - t0)); + out.stop(job.elapsedMs + status.elapsedMs); out.result(status.data); return; } } } catch (err) { - out.stop(Math.round(performance.now() - t0)); + out.stop(0); out.error(err instanceof Error ? err.message : String(err)); } }, diff --git a/src/commands/credits.ts b/src/commands/credits.ts index 457e27d..52375d7 100644 --- a/src/commands/credits.ts +++ b/src/commands/credits.ts @@ -1,5 +1,6 @@ import { defineCommand } from "citty"; -import { createClient } from "../lib/client.js"; +import { getCredits } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -12,16 +13,15 @@ export default defineCommand({ }, run: async ({ args }) => { const out = log.create(!!args.json); - const sgai = await createClient(!!args.json); + const apiKey = await getApiKey(!!args.json); out.start("Fetching credits"); - const t0 = performance.now(); try { - const result = await sgai.credits(); - out.stop(Math.round(performance.now() - t0)); + const result = await getCredits(apiKey); + out.stop(result.elapsedMs); out.result(result.data); } catch (err) { - out.stop(Math.round(performance.now() - t0)); + out.stop(0); out.error(err instanceof Error ? 
err.message : String(err)); } }, diff --git a/src/commands/extract.ts b/src/commands/extract.ts index ec3490d..513fa29 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -1,5 +1,6 @@ import { defineCommand } from "citty"; -import { createClient } from "../lib/client.js"; +import { extract } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -20,8 +21,8 @@ export default defineCommand({ required: true, }, schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + mode: { type: "string", description: "HTML processing mode: normal (default), reader, prune" }, scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, - mode: { type: "string", description: "Fetch mode: auto (default), fast, js" }, stealth: { type: "boolean", description: "Enable stealth mode" }, cookies: { type: "string", description: "Cookies as JSON object string" }, headers: { type: "string", description: "Custom headers as JSON object string" }, @@ -31,28 +32,30 @@ export default defineCommand({ run: async ({ args }) => { const out = log.create(!!args.json); out.docs("https://docs.scrapegraphai.com/api-reference/extract"); - const sgai = await createClient(!!args.json); + const apiKey = await getApiKey(!!args.json); const fetchConfig: Record = {}; if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); - if (args.mode) fetchConfig.mode = args.mode; if (args.stealth) fetchConfig.stealth = true; if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies); if (args.headers) fetchConfig.headers = JSON.parse(args.headers); if (args.country) fetchConfig.country = args.country; - const extractOptions: Record = { prompt: args.prompt }; - if (args.schema) extractOptions.schema = JSON.parse(args.schema); - if (Object.keys(fetchConfig).length > 0) extractOptions.fetchConfig = fetchConfig; + const params: Record = { + url: args.url, + prompt: 
args.prompt, + }; + if (args.schema) params.schema = JSON.parse(args.schema); + if (args.mode) params.mode = args.mode; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; out.start("Extracting"); - const t0 = performance.now(); try { - const result = await sgai.extract(args.url, extractOptions as any); - out.stop(Math.round(performance.now() - t0)); + const result = await extract(apiKey, params as any); + out.stop(result.elapsedMs); out.result(result.data); } catch (err) { - out.stop(Math.round(performance.now() - t0)); + out.stop(0); out.error(err instanceof Error ? err.message : String(err)); } }, diff --git a/src/commands/history.ts b/src/commands/history.ts index d9a19a1..fd70377 100644 --- a/src/commands/history.ts +++ b/src/commands/history.ts @@ -1,26 +1,31 @@ import * as p from "@clack/prompts"; import chalk from "chalk"; import { defineCommand } from "citty"; -import { createClient } from "../lib/client.js"; +import { history } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; import * as log from "../lib/log.js"; -const HISTORY_SERVICES = ["scrape", "extract", "search", "monitor", "crawl"] as const; +const HISTORY_SERVICES = ["scrape", "extract", "schema", "search", "monitor", "crawl"] as const; const VALID = HISTORY_SERVICES.join(", "); const LOAD_MORE = "__load_more__"; -function getId(row: Record): string { - return String(row.request_id ?? row.crawl_id ?? row.id ?? "unknown"); +type HistoryRow = Record; + +function getId(row: HistoryRow): string { + return String(row.id ?? "unknown"); } -function label(row: Record): string { +function label(row: HistoryRow): string { const id = getId(row); - const short = id.length > 12 ? `${id.slice(0, 12)}…` : id; + const short = id.length > 12 ? `${id.slice(0, 12)}...` : id; const status = String(row.status ?? "—"); - const url = String(row.website_url ?? row.url ?? row.user_prompt ?? ""); - const urlShort = url.length > 50 ?
`${url.slice(0, 49)}…` : url; + + const params = row.params as Record | undefined; + const url = String(params?.url ?? params?.query ?? ""); + const urlShort = url.length > 50 ? `${url.slice(0, 49)}...` : url; const color = - status === "completed" || status === "done" + status === "completed" ? chalk.green : status === "failed" ? chalk.red @@ -29,8 +34,8 @@ function label(row: Record): string { return `${chalk.dim(short)} ${color(status)} ${urlShort}`; } -function hint(row: Record): string { - const ts = row.created_at ?? row.timestamp ?? row.updated_at; +function hint(row: HistoryRow): string { + const ts = row.createdAt; if (!ts) return ""; const d = new Date(String(ts)); return Number.isNaN(d.getTime()) ? String(ts) : d.toLocaleString(); @@ -45,7 +50,7 @@ export default defineCommand({ service: { type: "positional", description: `Service name (${VALID})`, - required: true, + required: false, }, page: { type: "string", description: "Page number (default: 1)" }, "page-size": { type: "string", description: "Results per page (default: 20, max: 100)" }, @@ -54,38 +59,32 @@ export default defineCommand({ run: async ({ args }) => { const quiet = !!args.json; const out = log.create(quiet); - const sgai = await createClient(quiet); - const service = args.service as (typeof HISTORY_SERVICES)[number]; + const apiKey = await getApiKey(quiet); + const service = args.service as (typeof HISTORY_SERVICES)[number] | undefined; const requestId = (args as { _: string[] })._.at(1); const limit = args["page-size"] ? Number(args["page-size"]) : 20; let page = args.page ? 
Number(args.page) : 1; const fetchPage = async (pg: number) => { - const t0 = performance.now(); - const r = await sgai.history({ service, page: pg, limit }); - const ms = Math.round(performance.now() - t0); + const params: Record = { page: pg, limit }; + if (service) params.service = service; + const r = await history.list(apiKey, params as any); const d = r.data as { - data?: Record[]; - requests?: Record[]; - next_key?: string; - total?: number; - }; + data?: HistoryRow[]; + pagination?: { page: number; limit: number; total: number }; + } | null; + const rows = d?.data ?? []; + const total = d?.pagination?.total ?? 0; return { - rows: d.data ?? d.requests ?? [], - hasMore: !!d.next_key || (d.total != null && pg * limit < d.total), - ms, + rows, + hasMore: total > pg * limit, + ms: r.elapsedMs, }; }; - if (quiet || requestId) { + if (quiet && !requestId) { try { const { rows } = await fetchPage(page); - if (requestId) { - const match = rows.find((r) => getId(r) === requestId); - if (!match) out.error(`Request ${requestId} not found on page ${page}`); - out.result(match); - return; - } out.result(rows); } catch (err) { out.error(err instanceof Error ? err.message : String(err)); @@ -93,7 +92,17 @@ export default defineCommand({ return; } - out.start(`Fetching ${service} history`); + if (requestId) { + try { + const result = await history.get(apiKey, requestId); + out.result(result.data); + } catch (err) { + out.error(err instanceof Error ? err.message : String(err)); + } + return; + } + + out.start(`Fetching ${service ?? 
"all"} history`); try { const first = await fetchPage(page); out.stop(first.ms); diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts index 1de3a99..e95a16f 100644 --- a/src/commands/markdownify.ts +++ b/src/commands/markdownify.ts @@ -1,5 +1,6 @@ import { defineCommand } from "citty"; -import { createClient } from "../lib/client.js"; +import { scrape } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -21,24 +22,26 @@ export default defineCommand({ run: async ({ args }) => { const out = log.create(!!args.json); out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); - const sgai = await createClient(!!args.json); + const apiKey = await getApiKey(!!args.json); const fetchConfig: Record = {}; if (args.mode) fetchConfig.mode = args.mode; if (args.stealth) fetchConfig.stealth = true; if (args.headers) fetchConfig.headers = JSON.parse(args.headers); - const scrapeOptions: Record = { format: "markdown" }; - if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; + const params: Record = { + url: args.url, + formats: [{ type: "markdown", mode: "normal" }], + }; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; out.start("Converting to markdown"); - const t0 = performance.now(); try { - const result = await sgai.scrape(args.url, scrapeOptions as any); - out.stop(Math.round(performance.now() - t0)); + const result = await scrape(apiKey, params as any); + out.stop(result.elapsedMs); out.result(result.data); } catch (err) { - out.stop(Math.round(performance.now() - t0)); + out.stop(0); out.error(err instanceof Error ? 
err.message : String(err)); } }, diff --git a/src/commands/monitor.ts b/src/commands/monitor.ts new file mode 100644 index 0000000..d730bd3 --- /dev/null +++ b/src/commands/monitor.ts @@ -0,0 +1,243 @@ +import * as p from "@clack/prompts"; +import chalk from "chalk"; +import { defineCommand } from "citty"; +import { monitor } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; +import * as log from "../lib/log.js"; + +const ACTIONS = ["create", "list", "get", "update", "delete", "pause", "resume"] as const; +type Action = (typeof ACTIONS)[number]; + +const FORMATS = ["markdown", "html", "screenshot", "branding", "links", "images", "summary", "json"] as const; + +export default defineCommand({ + meta: { + name: "monitor", + description: "Create and manage page-change monitors", + }, + args: { + action: { + type: "positional", + description: `Action: ${ACTIONS.join(", ")}`, + required: true, + }, + url: { + type: "string", + description: "URL to monitor (for create)", + }, + id: { + type: "string", + description: "Monitor ID (for get, update, delete, pause, resume)", + }, + name: { + type: "string", + description: "Monitor name", + }, + interval: { + type: "string", + description: "Check interval (e.g. '1h', '30m', '1d') — required for create", + }, + format: { + type: "string", + alias: "f", + description: `Formats to track: ${FORMATS.join(", ")} (default: markdown).
Comma-separate for multi-format.`, + }, + "webhook-url": { + type: "string", + description: "Webhook URL to notify on changes", + }, + mode: { + type: "string", + alias: "m", + description: "Fetch mode: auto (default), fast, js", + }, + stealth: { type: "boolean", description: "Enable stealth mode" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + const apiKey = await getApiKey(!!args.json); + const action = args.action as Action; + + switch (action) { + case "create": { + if (!args.url) { + out.error("--url is required for create"); + return; + } + if (!args.interval) { + out.error("--interval is required for create"); + return; + } + + const requestedFormats = (args.format ?? "markdown") + .split(",") + .map((f) => f.trim()) + .filter(Boolean); + + const formats = requestedFormats.map((f) => { + if (f === "markdown" || f === "html") return { type: f as "markdown" | "html", mode: "normal" as const }; + return { type: f }; + }); + + const params: Record = { + url: args.url, + interval: args.interval, + formats, + }; + if (args.name) params.name = args.name; + if (args["webhook-url"]) params.webhookUrl = args["webhook-url"]; + + const fetchConfig: Record = {}; + if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; + + out.start("Creating monitor"); + try { + const result = await monitor.create(apiKey, params as any); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? 
err.message : String(err)); + } + break; + } + + case "list": { + out.start("Fetching monitors"); + try { + const result = await monitor.list(apiKey); + out.stop(result.elapsedMs); + + if (args.json) { + out.result(result.data); + return; + } + + const monitors = result.data as Array> | null; + if (!monitors?.length) { + p.log.warning("No monitors found."); + return; + } + + for (const m of monitors) { + const status = String(m.status ?? ""); + const color = status === "active" ? chalk.green : chalk.yellow; + p.log.info( + `${chalk.dim(String(m.cronId ?? m.scheduleId ?? ""))} ${color(status)} ${String((m.config as Record)?.url ?? "")} ${chalk.dim(String(m.interval ?? ""))}`, + ); + } + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + break; + } + + case "get": { + if (!args.id) { + out.error("--id is required for get"); + return; + } + out.start("Fetching monitor"); + try { + const result = await monitor.get(apiKey, args.id); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + break; + } + + case "update": { + if (!args.id) { + out.error("--id is required for update"); + return; + } + const params: Record = {}; + if (args.name) params.name = args.name; + if (args.interval) params.interval = args.interval; + if (args["webhook-url"]) params.webhookUrl = args["webhook-url"]; + if (args.format) { + params.formats = args.format + .split(",") + .map((f) => f.trim()) + .filter(Boolean) + .map((f) => { + if (f === "markdown" || f === "html") return { type: f, mode: "normal" as const }; + return { type: f }; + }); + } + + out.start("Updating monitor"); + try { + const result = await monitor.update(apiKey, args.id, params as any); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? 
err.message : String(err)); + } + break; + } + + case "delete": { + if (!args.id) { + out.error("--id is required for delete"); + return; + } + out.start("Deleting monitor"); + try { + const result = await monitor.delete(apiKey, args.id); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + break; + } + + case "pause": { + if (!args.id) { + out.error("--id is required for pause"); + return; + } + out.start("Pausing monitor"); + try { + const result = await monitor.pause(apiKey, args.id); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + break; + } + + case "resume": { + if (!args.id) { + out.error("--id is required for resume"); + return; + } + out.start("Resuming monitor"); + try { + const result = await monitor.resume(apiKey, args.id); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + break; + } + + default: + out.error(`Unknown action: ${action}. 
Valid: ${ACTIONS.join(", ")}`); + } + }, +}); diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index 0f7e4f6..310b81a 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -1,5 +1,6 @@ import { defineCommand } from "citty"; -import { createClient } from "../lib/client.js"; +import { scrape } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; import * as log from "../lib/log.js"; const FORMATS = [ @@ -57,7 +58,7 @@ export default defineCommand({ run: async ({ args }) => { const out = log.create(!!args.json); out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); - const sgai = await createClient(!!args.json); + const apiKey = await getApiKey(!!args.json); const fetchConfig: Record = {}; if (args.mode) fetchConfig.mode = args.mode; @@ -90,7 +91,7 @@ export default defineCommand({ case "json": { if (!args.prompt) { out.error("--prompt is required when --format includes json"); - return { type: "json" as const }; + return { type: "json" as const, prompt: "" }; } return { type: "json" as const, @@ -105,17 +106,16 @@ export default defineCommand({ } }); - const scrapeOptions: Record = { formats }; - if (Object.keys(fetchConfig).length > 0) scrapeOptions.fetchConfig = fetchConfig; + const params: Record = { url: args.url, formats }; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; out.start("Scraping"); - const t0 = performance.now(); try { - const result = await sgai.scrape(args.url, scrapeOptions as any); - out.stop(Math.round(performance.now() - t0)); + const result = await scrape(apiKey, params as any); + out.stop(result.elapsedMs); out.result(result.data); } catch (err) { - out.stop(Math.round(performance.now() - t0)); + out.stop(0); out.error(err instanceof Error ? 
err.message : String(err)); } }, diff --git a/src/commands/search.ts b/src/commands/search.ts index c6f5ed0..9bc113e 100644 --- a/src/commands/search.ts +++ b/src/commands/search.ts @@ -1,5 +1,6 @@ import { defineCommand } from "citty"; -import { createClient } from "../lib/client.js"; +import { search } from "scrapegraph-js"; +import { getApiKey } from "../lib/client.js"; import * as log from "../lib/log.js"; export default defineCommand({ @@ -42,25 +43,24 @@ export default defineCommand({ run: async ({ args }) => { const out = log.create(!!args.json); out.docs("https://docs.scrapegraphai.com/api-reference/search"); - const sgai = await createClient(!!args.json); + const apiKey = await getApiKey(!!args.json); - const searchOptions: Record = {}; - if (args["num-results"]) searchOptions.numResults = Number(args["num-results"]); - if (args.schema) searchOptions.schema = JSON.parse(args.schema); - if (args.prompt) searchOptions.prompt = args.prompt; - if (args.country) searchOptions.country = args.country; - if (args["time-range"]) searchOptions.timeRange = args["time-range"]; - if (args.format) searchOptions.format = args.format; - if (args.headers) searchOptions.fetchConfig = { headers: JSON.parse(args.headers) }; + const params: Record = { query: args.query }; + if (args["num-results"]) params.numResults = Number(args["num-results"]); + if (args.schema) params.schema = JSON.parse(args.schema); + if (args.prompt) params.prompt = args.prompt; + if (args.country) params.country = args.country; + if (args["time-range"]) params.timeRange = args["time-range"]; + if (args.format) params.format = args.format; + if (args.headers) params.fetchConfig = { headers: JSON.parse(args.headers) }; out.start("Searching"); - const t0 = performance.now(); try { - const result = await sgai.search(args.query, searchOptions as any); - out.stop(Math.round(performance.now() - t0)); + const result = await search(apiKey, params as any); + out.stop(result.elapsedMs); out.result(result.data); } 
catch (err) { - out.stop(Math.round(performance.now() - t0)); + out.stop(0); out.error(err instanceof Error ? err.message : String(err)); } }, diff --git a/src/lib/client.ts b/src/lib/client.ts index a83df1c..dc1b11b 100644 --- a/src/lib/client.ts +++ b/src/lib/client.ts @@ -1,18 +1,9 @@ -import { scrapegraphai } from "scrapegraph-js"; import { resolveApiKey } from "./folders.js"; -let cached: ReturnType | null = null; - -export async function createClient(quiet = false) { - const apiKey = await resolveApiKey(quiet); +let cached: string | null = null; +export async function getApiKey(quiet = false): Promise { if (cached) return cached; - - const baseUrl = process.env.SGAI_API_URL || undefined; - const timeout = process.env.SGAI_TIMEOUT_S - ? Number(process.env.SGAI_TIMEOUT_S) * 1000 - : undefined; - - cached = scrapegraphai({ apiKey, baseUrl, timeout }); + cached = await resolveApiKey(quiet); return cached; } From 480ac2e1cd1df23434d413db36af506ee03486a1 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 14 Apr 2026 16:04:58 +0200 Subject: [PATCH 11/11] fix(ci): biome formatting and update smoke test to use actual SDK exports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The smoke test was importing a `scrapegraphai` factory function that doesn't exist in the SDK — replaced with imports of the actual named exports (scrape, extract, search, crawl, history, monitor, getCredits). Fixed biome formatting in crawl.ts, history.ts, and monitor.ts.
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/commands/crawl.ts | 3 ++- src/commands/history.ts | 6 +----- src/commands/monitor.ts | 14 ++++++++++++-- tests/smoke.test.ts | 24 ++++++++++++------------ 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts index bfca8f0..4726a88 100644 --- a/src/commands/crawl.ts +++ b/src/commands/crawl.ts @@ -45,7 +45,8 @@ export default defineCommand({ .filter(Boolean); const formats = requestedFormats.map((f) => { - if (f === "markdown" || f === "html") return { type: f as "markdown" | "html", mode: "normal" as const }; + if (f === "markdown" || f === "html") + return { type: f as "markdown" | "html", mode: "normal" as const }; return { type: f }; }); diff --git a/src/commands/history.ts b/src/commands/history.ts index fd70377..3d9b3c3 100644 --- a/src/commands/history.ts +++ b/src/commands/history.ts @@ -25,11 +25,7 @@ function label(row: HistoryRow): string { const urlShort = url.length > 50 ? `${url.slice(0, 49)}...` : url; const color = - status === "completed" - ? chalk.green - : status === "failed" - ? chalk.red - : chalk.yellow; + status === "completed" ? chalk.green : status === "failed" ? 
chalk.red : chalk.yellow; return `${chalk.dim(short)} ${color(status)} ${urlShort}`; } diff --git a/src/commands/monitor.ts b/src/commands/monitor.ts index d730bd3..c7beb6a 100644 --- a/src/commands/monitor.ts +++ b/src/commands/monitor.ts @@ -8,7 +8,16 @@ import * as log from "../lib/log.js"; const ACTIONS = ["create", "list", "get", "update", "delete", "pause", "resume"] as const; type Action = (typeof ACTIONS)[number]; -const FORMATS = ["markdown", "html", "screenshot", "branding", "links", "images", "summary", "json"] as const; +const FORMATS = [ + "markdown", + "html", + "screenshot", + "branding", + "links", + "images", + "summary", + "json", +] as const; export default defineCommand({ meta: { @@ -76,7 +85,8 @@ export default defineCommand({ .filter(Boolean); const formats = requestedFormats.map((f) => { - if (f === "markdown" || f === "html") return { type: f as "markdown" | "html", mode: "normal" as const }; + if (f === "markdown" || f === "html") + return { type: f as "markdown" | "html", mode: "normal" as const }; return { type: f }; }); diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts index 40ba725..15fc5da 100644 --- a/tests/smoke.test.ts +++ b/tests/smoke.test.ts @@ -1,15 +1,15 @@ import { expect, test } from "bun:test"; -import { scrapegraphai } from "scrapegraph-js"; +import { crawl, extract, getCredits, history, monitor, scrape, search } from "scrapegraph-js"; -test("sdk v2 factory is callable and exposes expected methods", () => { - expect(typeof scrapegraphai).toBe("function"); - - const client = scrapegraphai({ apiKey: "sgai-test" }); - expect(typeof client.scrape).toBe("function"); - expect(typeof client.extract).toBe("function"); - expect(typeof client.search).toBe("function"); - expect(typeof client.credits).toBe("function"); - expect(typeof client.history).toBe("function"); - expect(typeof client.crawl.start).toBe("function"); - expect(typeof client.crawl.status).toBe("function"); +test("sdk v2 exports expected functions", () => { + 
expect(typeof scrape).toBe("function"); + expect(typeof extract).toBe("function"); + expect(typeof search).toBe("function"); + expect(typeof getCredits).toBe("function"); + expect(typeof history.list).toBe("function"); + expect(typeof history.get).toBe("function"); + expect(typeof crawl.start).toBe("function"); + expect(typeof crawl.get).toBe("function"); + expect(typeof monitor.create).toBe("function"); + expect(typeof monitor.list).toBe("function"); });