|
import { CrwIcon } from '@/components/icons'
import type { BlockConfig, BlockMeta } from '@/blocks/types'
import { AuthMode, IntegrationType } from '@/blocks/types'
import type { CrwResponse } from '@/tools/crw/types'
| 5 | + |
/**
 * Normalizes the `formats` block input into the array forwarded to the tool.
 *
 * - falsy input (unset, empty string) yields `undefined`, meaning "omit the field"
 * - an array is passed through untouched
 * - a string is parsed as JSON; anything that is not a JSON array (including
 *   malformed JSON) falls back to `['markdown']`
 * - any other type yields `undefined`
 */
function parseCrwFormats(formats: unknown): unknown[] | undefined {
  if (!formats) return undefined
  if (Array.isArray(formats)) return formats
  if (typeof formats !== 'string') return undefined

  try {
    const parsed: unknown = JSON.parse(formats)
    return Array.isArray(parsed) ? parsed : ['markdown']
  } catch {
    return ['markdown']
  }
}

/**
 * Converts a numeric block input (typed into a text field, so usually a string)
 * into an integer.
 *
 * Returns `undefined` for falsy input and for values that do not parse to a
 * finite number, so callers can omit the field instead of forwarding `NaN`.
 */
function parseCrwInteger(value: unknown): number | undefined {
  if (!value) return undefined

  const parsed =
    typeof value === 'number' ? Math.trunc(value) : Number.parseInt(String(value), 10)

  return Number.isFinite(parsed) ? parsed : undefined
}

/**
 * Block configuration for fastCRW.
 *
 * Exposes four operations (scrape, search, crawl, map) behind a single
 * `operation` dropdown. `tools.config.tool` picks the tool id for the selected
 * operation and `tools.config.params` builds the parameter object for it from
 * the block inputs.
 */
export const CrwBlock: BlockConfig<CrwResponse> = {
  type: 'crw',
  name: 'fastCRW',
  description: 'Scrape, search, crawl, and map web data',
  authMode: AuthMode.ApiKey,
  longDescription:
    'Integrate fastCRW into the workflow. Scrape pages, search the web, crawl entire sites, and map URL structures. fastCRW is a Firecrawl-compatible web scraper in a single binary — self-host or cloud.',
  docsLink: 'https://docs.sim.ai/integrations/crw',
  category: 'tools',
  integrationType: IntegrationType.Search,
  bgColor: '#181C1E',
  icon: CrwIcon,
  subBlocks: [
    {
      id: 'operation',
      title: 'Operation',
      type: 'dropdown',
      options: [
        { label: 'Scrape', id: 'scrape' },
        { label: 'Search', id: 'search' },
        { label: 'Crawl', id: 'crawl' },
        { label: 'Map', id: 'map' },
      ],
      value: () => 'scrape',
    },
    {
      id: 'url',
      title: 'Website URL',
      type: 'short-input',
      placeholder: 'Enter the website URL',
      condition: {
        field: 'operation',
        value: ['scrape', 'crawl', 'map'],
      },
      required: true,
    },
    {
      id: 'query',
      title: 'Search Query',
      type: 'short-input',
      placeholder: 'Enter the search query',
      condition: {
        field: 'operation',
        value: 'search',
      },
      required: true,
    },
    {
      id: 'onlyMainContent',
      title: 'Only Main Content',
      type: 'switch',
      condition: {
        field: 'operation',
        value: ['scrape', 'crawl'],
      },
    },
    {
      id: 'formats',
      title: 'Output Formats',
      type: 'long-input',
      placeholder: '["markdown", "html"]',
      condition: {
        field: 'operation',
        value: ['scrape', 'crawl'],
      },
    },
    {
      id: 'waitFor',
      title: 'Wait For (ms)',
      type: 'short-input',
      placeholder: '0',
      condition: {
        field: 'operation',
        value: 'scrape',
      },
    },
    {
      id: 'limit',
      title: 'Limit',
      type: 'short-input',
      placeholder: '100',
      condition: {
        field: 'operation',
        value: ['map', 'search'],
      },
    },
    {
      id: 'maxPages',
      title: 'Max Pages',
      type: 'short-input',
      placeholder: '100',
      condition: {
        field: 'operation',
        value: 'crawl',
      },
    },
    {
      id: 'baseUrl',
      title: 'Base URL',
      type: 'short-input',
      placeholder: 'https://fastcrw.com/api',
      mode: 'advanced',
    },
    {
      id: 'apiKey',
      title: 'API Key',
      type: 'short-input',
      placeholder: 'Enter your fastCRW API key',
      password: true,
      required: true,
      hideWhenHosted: true,
    },
  ],
  tools: {
    access: ['crw_scrape', 'crw_search', 'crw_crawl', 'crw_map'],
    config: {
      // Maps the selected operation to its tool id; anything unrecognized scrapes.
      tool: (params) => {
        switch (params.operation) {
          case 'scrape':
            return 'crw_scrape'
          case 'search':
            return 'crw_search'
          case 'crawl':
            return 'crw_crawl'
          case 'map':
            return 'crw_map'
          default:
            return 'crw_scrape'
        }
      },
      // Builds the tool parameters, forwarding only the fields that belong to
      // the selected operation and skipping unset or unparseable values.
      params: (params) => {
        const {
          operation,
          limit,
          maxPages,
          formats,
          waitFor,
          url,
          query,
          onlyMainContent,
          baseUrl,
          apiKey,
        } = params

        const result: Record<string, any> = { apiKey }

        if (baseUrl) result.baseUrl = baseUrl

        switch (operation) {
          case 'search': {
            if (query) result.query = query
            const parsedLimit = parseCrwInteger(limit)
            if (parsedLimit !== undefined) result.limit = parsedLimit
            break
          }

          case 'crawl': {
            if (url) result.url = url
            const parsedMaxPages = parseCrwInteger(maxPages)
            if (parsedMaxPages !== undefined) result.maxPages = parsedMaxPages
            const parsedFormats = parseCrwFormats(formats)
            if (parsedFormats !== undefined) result.formats = parsedFormats
            // `!= null` keeps an explicit `false` from the switch.
            if (onlyMainContent != null) result.onlyMainContent = onlyMainContent
            break
          }

          case 'map': {
            if (url) result.url = url
            const parsedLimit = parseCrwInteger(limit)
            if (parsedLimit !== undefined) result.limit = parsedLimit
            break
          }

          // 'scrape', plus any missing/unknown operation: the tool selector
          // above resolves those to 'crw_scrape', so build scrape params too.
          default: {
            if (url) result.url = url
            const parsedFormats = parseCrwFormats(formats)
            if (parsedFormats !== undefined) result.formats = parsedFormats
            const parsedWaitFor = parseCrwInteger(waitFor)
            if (parsedWaitFor !== undefined) result.waitFor = parsedWaitFor
            // `!= null` keeps an explicit `false` from the switch.
            if (onlyMainContent != null) result.onlyMainContent = onlyMainContent
            break
          }
        }

        return result
      },
    },
  },
  inputs: {
    apiKey: { type: 'string', description: 'fastCRW API key' },
    baseUrl: { type: 'string', description: 'Base URL for self-hosted fastCRW' },
    operation: { type: 'string', description: 'Operation to perform' },
    url: { type: 'string', description: 'Target website URL' },
    query: { type: 'string', description: 'Search query terms' },
    limit: { type: 'string', description: 'Result/link limit' },
    maxPages: { type: 'string', description: 'Maximum pages to crawl' },
    formats: { type: 'json', description: 'Output formats array' },
    waitFor: { type: 'number', description: 'Wait time before scraping in ms' },
    onlyMainContent: { type: 'boolean', description: 'Extract only main content' },
    // NOTE(review): declared as an input, but the params mapper above never
    // forwards `scrapeOptions` to any tool — confirm whether it should.
    scrapeOptions: { type: 'json', description: 'Advanced scraping options' },
  },
  outputs: {
    // Scrape output
    markdown: { type: 'string', description: 'Page content markdown' },
    html: { type: 'string', description: 'Raw HTML content' },
    metadata: { type: 'json', description: 'Page metadata' },
    // Search output
    data: { type: 'json', description: 'Search results data' },
    // Crawl output
    pages: { type: 'json', description: 'Crawled pages data' },
    total: { type: 'number', description: 'Total pages found' },
    // Map output
    success: { type: 'boolean', description: 'Operation success status' },
    links: { type: 'json', description: 'Discovered URLs array' },
  },
}
| 234 | + |
/**
 * Catalog metadata for the fastCRW block: discovery tags, starter workflow
 * templates, and agent skills written as markdown instructions.
 *
 * Declared `as const satisfies BlockMeta` so the literal types are preserved
 * while the shape is still checked against `BlockMeta`.
 */
export const CrwBlockMeta = {
  tags: ['web-scraping', 'automation'],
  templates: [
    {
      icon: CrwIcon,
      title: 'fastCRW competitor site monitor',
      prompt:
        'Build a scheduled workflow that uses fastCRW to scrape competitor pricing, product, and changelog pages weekly, diffs against the prior snapshot, and posts changes to Slack.',
      modules: ['scheduled', 'agent', 'workflows'],
      category: 'marketing',
      tags: ['marketing', 'monitoring'],
      alsoIntegrations: ['slack'],
    },
    {
      icon: CrwIcon,
      title: 'fastCRW knowledge-base builder',
      prompt:
        'Build a workflow that crawls a documentation site with fastCRW, chunks and embeds the pages, and upserts them into a knowledge base for an answering agent.',
      modules: ['knowledge-base', 'agent', 'workflows'],
      category: 'engineering',
      tags: ['research', 'sync'],
    },
    {
      icon: CrwIcon,
      title: 'fastCRW research stack',
      prompt:
        'Create an agent that uses fastCRW Search to find authoritative URLs on a topic, scrapes each with fastCRW, and produces a structured research brief with citations.',
      modules: ['agent', 'files', 'workflows'],
      category: 'productivity',
      tags: ['research'],
    },
  ],
  skills: [
    {
      name: 'scrape-page-to-markdown',
      description:
        'Scrape a single URL with fastCRW and return clean main-content markdown for an agent to read.',
      content:
        '# Scrape Page to Markdown\n\nUse fastCRW to fetch a web page as clean, LLM-ready markdown.\n\n## Steps\n1. Use the Scrape operation on the target URL.\n2. Enable Only Main Content to strip navigation, ads, and footers; set a Wait For delay if the page renders content with JavaScript.\n3. Return the markdown output and capture page metadata (title, description).\n\n## Output\nReturn the page markdown plus key metadata. If the page failed to load or returned empty content, report that instead of fabricating text.',
    },
    {
      name: 'crawl-site',
      description:
        'Crawl an entire site or section with fastCRW and return the page content for indexing or analysis.',
      content:
        '# Crawl Site\n\nUse fastCRW to traverse a site and collect its pages.\n\n## Steps\n1. Use the Crawl operation on the root URL, setting a sensible Max Pages limit to control cost.\n2. Enable Only Main Content so each page comes back as clean markdown.\n3. Collect the crawled pages and their URLs from the response.\n\n## Output\nReturn the list of crawled pages with their URL and markdown content, plus the total page count. This output is ready to chunk and embed into a knowledge base.',
    },
    {
      name: 'research-with-search',
      description:
        'Run a web search with fastCRW, then scrape the top results into a cited research brief.',
      content:
        '# Research With Search\n\nUse fastCRW to gather and synthesize web sources on a topic.\n\n## Steps\n1. Use the Search operation with the research query and a result Limit.\n2. For the most relevant results, use Scrape to pull the full page markdown.\n3. Synthesize the findings into a brief, attributing each claim to its source URL.\n\n## Output\nReturn a structured research brief with key findings and a Sources list of the URLs used. Keep claims grounded in the scraped content.',
    },
  ],
} as const satisfies BlockMeta
0 commit comments