From 2964aed418553ce773107e264d26c1161525abe2 Mon Sep 17 00:00:00 2001 From: us Date: Sat, 13 Jun 2026 20:21:31 +0300 Subject: [PATCH 1/2] feat: add fastCRW tool block --- .../settings/components/byok/byok.tsx | 9 + apps/sim/blocks/blocks/crw.ts | 290 ++++++++++++++++++ apps/sim/blocks/registry.ts | 3 + apps/sim/components/icons.tsx | 19 ++ apps/sim/lib/api/contracts/byok-keys.ts | 1 + apps/sim/lib/core/security/csp.ts | 1 + apps/sim/lib/integrations/icon-mapping.ts | 2 + apps/sim/lib/integrations/integrations.json | 35 +++ apps/sim/tools/crw/base-url.ts | 17 + apps/sim/tools/crw/crawl.ts | 209 +++++++++++++ apps/sim/tools/crw/crw.test.ts | 159 ++++++++++ apps/sim/tools/crw/index.ts | 9 + apps/sim/tools/crw/map.ts | 94 ++++++ apps/sim/tools/crw/scrape.ts | 109 +++++++ apps/sim/tools/crw/search.ts | 87 ++++++ apps/sim/tools/crw/types.ts | 276 +++++++++++++++++ apps/sim/tools/registry.ts | 5 + apps/sim/tools/types.ts | 1 + 18 files changed, 1326 insertions(+) create mode 100644 apps/sim/blocks/blocks/crw.ts create mode 100644 apps/sim/tools/crw/base-url.ts create mode 100644 apps/sim/tools/crw/crawl.ts create mode 100644 apps/sim/tools/crw/crw.test.ts create mode 100644 apps/sim/tools/crw/index.ts create mode 100644 apps/sim/tools/crw/map.ts create mode 100644 apps/sim/tools/crw/scrape.ts create mode 100644 apps/sim/tools/crw/search.ts create mode 100644 apps/sim/tools/crw/types.ts diff --git a/apps/sim/app/workspace/[workspaceId]/settings/components/byok/byok.tsx b/apps/sim/app/workspace/[workspaceId]/settings/components/byok/byok.tsx index 99f25b48bf6..7be7d453f97 100644 --- a/apps/sim/app/workspace/[workspaceId]/settings/components/byok/byok.tsx +++ b/apps/sim/app/workspace/[workspaceId]/settings/components/byok/byok.tsx @@ -6,6 +6,7 @@ import { AnthropicIcon, BasetenIcon, BrandfetchIcon, + CrwIcon, ExaAIIcon, FalIcon, FindymailIcon, @@ -111,6 +112,13 @@ const PROVIDERS: (BYOKManagerProvider & { id: BYOKProviderId })[] = [ description: 'Web scraping, crawling, 
search, and extraction', placeholder: 'Enter your Firecrawl API key', }, + { + id: 'crw', + name: 'fastCRW', + icon: CrwIcon, + description: 'Web scraping, crawling, search, and mapping', + placeholder: 'Enter your fastCRW API key', + }, { id: 'exa', name: 'Exa', @@ -249,6 +257,7 @@ const PROVIDER_SECTIONS: BYOKProviderSection[] = [ label: 'Search & web', ids: [ 'firecrawl', + 'crw', 'exa', 'serper', 'linkup', diff --git a/apps/sim/blocks/blocks/crw.ts b/apps/sim/blocks/blocks/crw.ts new file mode 100644 index 00000000000..fb8e2036cce --- /dev/null +++ b/apps/sim/blocks/blocks/crw.ts @@ -0,0 +1,290 @@ +import { CrwIcon } from '@/components/icons' +import type { BlockConfig, BlockMeta } from '@/blocks/types' +import { AuthMode, IntegrationType } from '@/blocks/types' +import type { CrwResponse } from '@/tools/crw/types' + +export const CrwBlock: BlockConfig = { + type: 'crw', + name: 'fastCRW', + description: 'Scrape, search, crawl, and map web data', + authMode: AuthMode.ApiKey, + longDescription: + 'Integrate fastCRW into the workflow. Scrape pages, search the web, crawl entire sites, and map URL structures. 
fastCRW is a Firecrawl-compatible web scraper in a single binary — self-host or cloud.', + docsLink: 'https://docs.sim.ai/integrations/crw', + category: 'tools', + integrationType: IntegrationType.Search, + bgColor: '#181C1E', + icon: CrwIcon, + subBlocks: [ + { + id: 'operation', + title: 'Operation', + type: 'dropdown', + options: [ + { label: 'Scrape', id: 'scrape' }, + { label: 'Search', id: 'search' }, + { label: 'Crawl', id: 'crawl' }, + { label: 'Map', id: 'map' }, + ], + value: () => 'scrape', + }, + { + id: 'url', + title: 'Website URL', + type: 'short-input', + placeholder: 'Enter the website URL', + condition: { + field: 'operation', + value: ['scrape', 'crawl', 'map'], + }, + required: true, + }, + { + id: 'query', + title: 'Search Query', + type: 'short-input', + placeholder: 'Enter the search query', + condition: { + field: 'operation', + value: 'search', + }, + required: true, + }, + { + id: 'onlyMainContent', + title: 'Only Main Content', + type: 'switch', + condition: { + field: 'operation', + value: ['scrape', 'crawl'], + }, + }, + { + id: 'formats', + title: 'Output Formats', + type: 'long-input', + placeholder: '["markdown", "html"]', + condition: { + field: 'operation', + value: ['scrape', 'crawl'], + }, + }, + { + id: 'waitFor', + title: 'Wait For (ms)', + type: 'short-input', + placeholder: '0', + condition: { + field: 'operation', + value: 'scrape', + }, + }, + { + id: 'limit', + title: 'Limit', + type: 'short-input', + placeholder: '100', + condition: { + field: 'operation', + value: ['map', 'search'], + }, + }, + { + id: 'maxPages', + title: 'Max Pages', + type: 'short-input', + placeholder: '100', + condition: { + field: 'operation', + value: 'crawl', + }, + }, + { + id: 'baseUrl', + title: 'Base URL', + type: 'short-input', + placeholder: 'https://fastcrw.com/api', + mode: 'advanced', + }, + { + id: 'apiKey', + title: 'API Key', + type: 'short-input', + placeholder: 'Enter your fastCRW API key', + password: true, + required: true, + 
+      hideWhenHosted: true,
+    },
+  ],
+  tools: {
+    access: ['crw_scrape', 'crw_search', 'crw_crawl', 'crw_map'],
+    config: {
+      // Map the selected operation to its tool id; unknown values fall back to scrape.
+      tool: (params) => {
+        switch (params.operation) {
+          case 'scrape':
+            return 'crw_scrape'
+          case 'search':
+            return 'crw_search'
+          case 'crawl':
+            return 'crw_crawl'
+          case 'map':
+            return 'crw_map'
+          default:
+            return 'crw_scrape'
+        }
+      },
+      /**
+       * Build the tool params for the selected operation from the raw sub-block values.
+       *
+       * Only the fields relevant to the operation are forwarded. Numeric fields arrive as
+       * free-text input, so they are parsed in base 10 and dropped when they are not numeric
+       * (a NaN would otherwise be serialized as `null` in the request body).
+       */
+      params: (params) => {
+        const {
+          operation,
+          limit,
+          maxPages,
+          formats,
+          waitFor,
+          url,
+          query,
+          onlyMainContent,
+          baseUrl,
+          apiKey,
+        } = params
+
+        const result: Record<string, unknown> = { apiKey }
+
+        if (baseUrl) result.baseUrl = baseUrl
+
+        // Copy a numeric text field onto the result; empty or non-numeric input is skipped.
+        const assignInt = (key: string, value: unknown): void => {
+          if (!value) return
+          const parsed = Number.parseInt(String(value), 10)
+          if (!Number.isNaN(parsed)) result[key] = parsed
+        }
+
+        // Accept `formats` as an array or a JSON-encoded array; anything unparsable
+        // (or valid JSON that is not an array) falls back to markdown.
+        const assignFormats = (): void => {
+          if (!formats) return
+          if (Array.isArray(formats)) {
+            result.formats = formats
+            return
+          }
+          if (typeof formats !== 'string') return
+          try {
+            const parsed: unknown = JSON.parse(formats)
+            result.formats = Array.isArray(parsed) ? parsed : ['markdown']
+          } catch {
+            result.formats = ['markdown']
+          }
+        }
+
+        switch (operation) {
+          case 'scrape':
+            if (url) result.url = url
+            assignFormats()
+            assignInt('waitFor', waitFor)
+            if (onlyMainContent != null) result.onlyMainContent = onlyMainContent
+            break
+
+          case 'search':
+            if (query) result.query = query
+            assignInt('limit', limit)
+            break
+
+          case 'crawl':
+            if (url) result.url = url
+            assignInt('maxPages', maxPages)
+            assignFormats()
+            if (onlyMainContent != null) result.onlyMainContent = onlyMainContent
+            break
+
+          case 'map':
+            if (url) result.url = url
+            assignInt('limit', limit)
+            break
+        }
+
+        return result
+      },
+    },
+  },
+  inputs: {
+    apiKey: { type: 'string', description: 'fastCRW API key' },
+    baseUrl: { type: 'string', description: 'Base URL for self-hosted fastCRW' },
+    operation: { type: 'string', description: 'Operation to perform' },
+    url: { type: 'string', description: 'Target website URL' },
+    query: { type: 'string', description: 'Search query terms' },
+    limit: { type: 'string', description: 'Result/link limit' },
+    maxPages: { type: 'string', description: 'Maximum pages to crawl' },
+    formats: { type: 'json', description: 'Output formats array' },
+    waitFor: { type: 'number', description: 'Wait time before scraping in ms' },
+    onlyMainContent: { type: 'boolean', description: 'Extract only main content' },
+    scrapeOptions: { type: 'json', description: 'Advanced scraping options' },
+  },
+  outputs: {
+    // Scrape output
+    markdown: { type: 'string', description: 'Page content markdown' },
+    html: { type: 'string', description: 'Raw HTML content' },
+    metadata: { type: 'json', description: 'Page metadata' },
+    // Search output
+    data: { type: 'json', description: 'Search results data' },
+    // Crawl output
+    pages: { type: 'json', description: 'Crawled pages data' },
+    total: { type: 'number', description: 'Total pages found' },
+    // Map output
+    success: { type: 'boolean', description: 'Operation success status' },
+    links: { type: 'json', description: 'Discovered URLs array' },
+  },
+}
+
+export const CrwBlockMeta = {
+  tags: ['web-scraping', 'automation'],
+  templates: [
+    {
+      icon: CrwIcon,
+      title: 'fastCRW competitor site monitor',
+      prompt:
+        'Build a scheduled workflow that uses fastCRW to scrape competitor pricing, product, and changelog pages weekly, diffs against the 
prior snapshot, and posts changes to Slack.', + modules: ['scheduled', 'agent', 'workflows'], + category: 'marketing', + tags: ['marketing', 'monitoring'], + alsoIntegrations: ['slack'], + }, + { + icon: CrwIcon, + title: 'fastCRW knowledge-base builder', + prompt: + 'Build a workflow that crawls a documentation site with fastCRW, chunks and embeds the pages, and upserts them into a knowledge base for an answering agent.', + modules: ['knowledge-base', 'agent', 'workflows'], + category: 'engineering', + tags: ['research', 'sync'], + }, + { + icon: CrwIcon, + title: 'fastCRW research stack', + prompt: + 'Create an agent that uses fastCRW Search to find authoritative URLs on a topic, scrapes each with fastCRW, and produces a structured research brief with citations.', + modules: ['agent', 'files', 'workflows'], + category: 'productivity', + tags: ['research'], + }, + ], + skills: [ + { + name: 'scrape-page-to-markdown', + description: + 'Scrape a single URL with fastCRW and return clean main-content markdown for an agent to read.', + content: + '# Scrape Page to Markdown\n\nUse fastCRW to fetch a web page as clean, LLM-ready markdown.\n\n## Steps\n1. Use the Scrape operation on the target URL.\n2. Enable Only Main Content to strip navigation, ads, and footers; set a Wait For delay if the page renders content with JavaScript.\n3. Return the markdown output and capture page metadata (title, description).\n\n## Output\nReturn the page markdown plus key metadata. If the page failed to load or returned empty content, report that instead of fabricating text.', + }, + { + name: 'crawl-site', + description: + 'Crawl an entire site or section with fastCRW and return the page content for indexing or analysis.', + content: + '# Crawl Site\n\nUse fastCRW to traverse a site and collect its pages.\n\n## Steps\n1. Use the Crawl operation on the root URL, setting a sensible Max Pages limit to control cost.\n2. Enable Only Main Content so each page comes back as clean markdown.\n3. 
Collect the crawled pages and their URLs from the response.\n\n## Output\nReturn the list of crawled pages with their URL and markdown content, plus the total page count. This output is ready to chunk and embed into a knowledge base.', + }, + { + name: 'research-with-search', + description: + 'Run a web search with fastCRW, then scrape the top results into a cited research brief.', + content: + '# Research With Search\n\nUse fastCRW to gather and synthesize web sources on a topic.\n\n## Steps\n1. Use the Search operation with the research query and a result Limit.\n2. For the most relevant results, use Scrape to pull the full page markdown.\n3. Synthesize the findings into a brief, attributing each claim to its source URL.\n\n## Output\nReturn a structured research brief with key findings and a Sources list of the URLs used. Keep claims grounded in the scraped content.', + }, + ], +} as const satisfies BlockMeta diff --git a/apps/sim/blocks/registry.ts b/apps/sim/blocks/registry.ts index 2cacc4576c6..3a23e65c5c3 100644 --- a/apps/sim/blocks/registry.ts +++ b/apps/sim/blocks/registry.ts @@ -41,6 +41,7 @@ import { ConfluenceBlock, ConfluenceBlockMeta, ConfluenceV2Block } from '@/block import { ConvexBlock, ConvexBlockMeta } from '@/blocks/blocks/convex' import { CredentialBlock } from '@/blocks/blocks/credential' import { CrowdStrikeBlock, CrowdStrikeBlockMeta } from '@/blocks/blocks/crowdstrike' +import { CrwBlock, CrwBlockMeta } from '@/blocks/blocks/crw' import { CursorBlock, CursorBlockMeta, CursorV2Block } from '@/blocks/blocks/cursor' import { DagsterBlock, DagsterBlockMeta } from '@/blocks/blocks/dagster' import { DatabricksBlock, DatabricksBlockMeta } from '@/blocks/blocks/databricks' @@ -406,6 +407,7 @@ const BLOCK_REGISTRY: Record = { zerobounce: ZeroBounceBlock, neverbounce: NeverBounceBlock, millionverifier: MillionVerifierBlock, + crw: CrwBlock, firecrawl: FirecrawlBlock, fireflies: FirefliesBlock, fireflies_v2: FirefliesV2Block, @@ -669,6 +671,7 @@ 
const BLOCK_META_REGISTRY: Record = { confluence: ConfluenceBlockMeta, convex: ConvexBlockMeta, crowdstrike: CrowdStrikeBlockMeta, + crw: CrwBlockMeta, cursor: CursorBlockMeta, dagster: DagsterBlockMeta, databricks: DatabricksBlockMeta, diff --git a/apps/sim/components/icons.tsx b/apps/sim/components/icons.tsx index 162de3ad5f9..d9b559031d9 100644 --- a/apps/sim/components/icons.tsx +++ b/apps/sim/components/icons.tsx @@ -604,6 +604,25 @@ export function FirecrawlIcon(props: SVGProps) { ) } +export function CrwIcon(props: SVGProps) { + return ( + + + + + ) +} + export function JinaAIIcon(props: SVGProps) { return ( = { confluence_v2: ConfluenceIcon, convex: ConvexIcon, crowdstrike: CrowdStrikeIcon, + crw: CrwIcon, cursor_v2: CursorIcon, dagster: DagsterIcon, databricks: DatabricksIcon, diff --git a/apps/sim/lib/integrations/integrations.json b/apps/sim/lib/integrations/integrations.json index 853a8728a02..55ec6a3f19b 100644 --- a/apps/sim/lib/integrations/integrations.json +++ b/apps/sim/lib/integrations/integrations.json @@ -4676,6 +4676,41 @@ "integrationType": "ai", "tags": ["document-processing", "ocr"] }, + { + "type": "crw", + "slug": "fastcrw", + "name": "fastCRW", + "description": "Scrape, search, crawl, and map web data", + "longDescription": "Integrate fastCRW into the workflow. Scrape pages, search the web, crawl entire sites, and map URL structures. fastCRW is a Firecrawl-compatible web scraper in a single binary — self-host or cloud.", + "bgColor": "#181C1E", + "iconName": "CrwIcon", + "docsUrl": "https://docs.sim.ai/integrations/crw", + "operations": [ + { + "name": "Scrape", + "description": "Extract structured content from web pages with comprehensive metadata support. Converts content to markdown or HTML while capturing SEO metadata, Open Graph tags, and page information." 
+ }, + { + "name": "Search", + "description": "Search for information on the web using fastCRW" + }, + { + "name": "Crawl", + "description": "Crawl entire websites and extract structured content from all accessible pages" + }, + { + "name": "Map", + "description": "Get a complete list of URLs from any website quickly and reliably. Useful for discovering all pages on a site without crawling them." + } + ], + "operationCount": 4, + "triggers": [], + "triggerCount": 0, + "authType": "api-key", + "category": "tools", + "integrationType": "search", + "tags": ["web-scraping", "automation"] + }, { "type": "fathom", "slug": "fathom", diff --git a/apps/sim/tools/crw/base-url.ts b/apps/sim/tools/crw/base-url.ts new file mode 100644 index 00000000000..b5a8341a369 --- /dev/null +++ b/apps/sim/tools/crw/base-url.ts @@ -0,0 +1,17 @@ +/** + * Default base URL for the managed fastCRW cloud. + * + * fastCRW is Firecrawl-compatible and can also be self-hosted (single Rust + * binary). Pass a `baseUrl` to point a tool at a self-hosted server + * (e.g., "http://localhost:3000"); otherwise the managed cloud is used. + */ +export const DEFAULT_CRW_BASE_URL = 'https://fastcrw.com/api' + +/** + * Resolve the fastCRW base URL, falling back to the managed cloud and stripping + * any trailing slash so endpoint paths can be appended cleanly. 
+ */ +export function resolveCrwBaseUrl(baseUrl?: string): string { + const url = baseUrl?.trim() || DEFAULT_CRW_BASE_URL + return url.replace(/\/+$/, '') +} diff --git a/apps/sim/tools/crw/crawl.ts b/apps/sim/tools/crw/crawl.ts new file mode 100644 index 00000000000..1ae3ced76a5 --- /dev/null +++ b/apps/sim/tools/crw/crawl.ts @@ -0,0 +1,209 @@ +import { createLogger } from '@sim/logger' +import { sleep } from '@sim/utils/helpers' +import { DEFAULT_EXECUTION_TIMEOUT_MS } from '@/lib/core/execution-limits' +import { resolveCrwBaseUrl } from '@/tools/crw/base-url' +import type { CrwCrawlParams, CrwCrawlResponse } from '@/tools/crw/types' +import { CRAWLED_PAGE_OUTPUT_PROPERTIES } from '@/tools/crw/types' +import type { ToolConfig } from '@/tools/types' + +const logger = createLogger('CrwCrawlTool') + +const POLL_INTERVAL_MS = 5000 +const MAX_POLL_TIME_MS = DEFAULT_EXECUTION_TIMEOUT_MS + +export const crawlTool: ToolConfig = { + id: 'crw_crawl', + name: 'fastCRW Crawl', + description: 'Crawl entire websites and extract structured content from all accessible pages', + version: '1.0.0', + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: + 'The website URL to crawl (e.g., "https://example.com" or "https://docs.example.com/guide")', + }, + maxPages: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum number of pages to crawl (e.g., 50, 100, 500). Default: 100', + }, + maxDepth: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: + 'Maximum depth to crawl from the starting URL (e.g., 1, 2, 3). 
Controls how many levels deep to follow links', + }, + formats: { + type: 'json', + required: false, + visibility: 'user-or-llm', + description: + 'Output formats for scraped content (e.g., ["markdown"], ["markdown", "html"], ["markdown", "links"])', + }, + excludePaths: { + type: 'json', + required: false, + visibility: 'user-or-llm', + description: 'URL paths to exclude from crawling (e.g., ["/blog/*", "/admin/*", "/*.pdf"])', + }, + includePaths: { + type: 'json', + required: false, + visibility: 'user-or-llm', + description: + 'URL paths to include in crawling (e.g., ["/docs/*", "/api/*"]). Only these paths will be crawled', + }, + onlyMainContent: { + type: 'boolean', + required: false, + visibility: 'user-only', + description: 'Extract only main content from pages', + }, + baseUrl: { + type: 'string', + required: false, + visibility: 'user-only', + description: 'Base URL for self-hosted fastCRW (defaults to https://fastcrw.com/api)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'fastCRW API key', + }, + }, + + hosting: { + envKeyPrefix: 'CRW_API_KEY', + apiKeyParam: 'apiKey', + byokProviderId: 'crw', + // fastCRW is BYOK-only — Sim does not meter usage. 
+    pricing: { type: 'per_request', cost: 0 },
+    rateLimit: {
+      mode: 'per_request',
+      requestsPerMinute: 100,
+    },
+  },
+
+  request: {
+    url: (params) => `${resolveCrwBaseUrl(params.baseUrl)}/v1/crawl`,
+    method: 'POST',
+    headers: (params) => ({
+      'Content-Type': 'application/json',
+      Authorization: `Bearer ${params.apiKey}`,
+    }),
+    // Build the crawl-creation payload. `scrapeOptions`, when supplied, replaces the
+    // defaults derived from `formats` / `onlyMainContent`.
+    body: (params) => {
+      const body: Record<string, unknown> = {
+        url: params.url,
+        maxPages: Number(params.maxPages) || 100,
+        scrapeOptions: params.scrapeOptions || {
+          formats: params.formats || ['markdown'],
+          onlyMainContent: params.onlyMainContent || false,
+        },
+      }
+
+      if (params.maxDepth) body.maxDepth = Number(params.maxDepth)
+      if (params.excludePaths) body.excludePaths = params.excludePaths
+      if (params.includePaths) body.includePaths = params.includePaths
+
+      return body
+    },
+  },
+  // The create call is asynchronous: it only yields a job id, which postProcess polls.
+  transformResponse: async (response: Response) => {
+    const data = await response.json()
+
+    // Without a job id there is nothing to poll; report the failure here instead of
+    // letting postProcess hit `/v1/crawl/undefined` until it errors or times out.
+    if (data?.success === false || !data?.id) {
+      return {
+        success: false,
+        error:
+          typeof data?.error === 'string'
+            ? data.error
+            : 'fastCRW did not return a crawl job id',
+        output: {
+          pages: [],
+          total: 0,
+        },
+      }
+    }
+
+    return {
+      success: true,
+      output: {
+        jobId: data.id,
+        pages: [],
+        total: 0,
+      },
+    }
+  },
+  // Poll the crawl job every POLL_INTERVAL_MS until it completes, fails, or the
+  // accumulated wait reaches MAX_POLL_TIME_MS.
+  postProcess: async (result, params) => {
+    if (!result.success) {
+      return result
+    }
+
+    const jobId = result.output.jobId
+    const baseUrl = resolveCrwBaseUrl(params.baseUrl)
+    logger.info(`fastCRW crawl job ${jobId} created, polling for completion...`)
+
+    let elapsedTime = 0
+
+    while (elapsedTime < MAX_POLL_TIME_MS) {
+      try {
+        // Encode the id so an unexpected character cannot alter the request path.
+        const statusResponse = await fetch(
+          `${baseUrl}/v1/crawl/${encodeURIComponent(String(jobId))}`,
+          {
+            method: 'GET',
+            headers: {
+              Authorization: `Bearer ${params.apiKey}`,
+              'Content-Type': 'application/json',
+            },
+          }
+        )
+
+        if (!statusResponse.ok) {
+          throw new Error(`Failed to get crawl status: ${statusResponse.statusText}`)
+        }
+
+        const crawlData = await statusResponse.json()
+        logger.info(`fastCRW crawl job ${jobId} status: ${crawlData.status}`)
+
+        if (crawlData.status === 'completed') {
+          result.output = {
+            pages: crawlData.data || [],
+            total: crawlData.total || (crawlData.data || []).length,
+          }
+          return result
+        }
+
+        if (crawlData.status === 'failed') {
+          return {
+
...result, + success: false, + error: `Crawl job failed: ${crawlData.error || 'Unknown error'}`, + } + } + + await sleep(POLL_INTERVAL_MS) + elapsedTime += POLL_INTERVAL_MS + } catch (error: any) { + logger.error('Error polling for crawl job status:', { + message: error.message || 'Unknown error', + jobId, + }) + + return { + ...result, + success: false, + error: `Error polling for crawl job status: ${error.message || 'Unknown error'}`, + } + } + } + + logger.warn( + `Crawl job ${jobId} did not complete within the maximum polling time (${MAX_POLL_TIME_MS / 1000}s)` + ) + return { + ...result, + success: false, + error: `Crawl job did not complete within the maximum polling time (${MAX_POLL_TIME_MS / 1000}s)`, + } + }, + + outputs: { + pages: { + type: 'array', + description: 'Array of crawled pages with their content and metadata', + items: { + type: 'object', + properties: CRAWLED_PAGE_OUTPUT_PROPERTIES, + }, + }, + total: { type: 'number', description: 'Total number of pages found during crawl' }, + }, +} diff --git a/apps/sim/tools/crw/crw.test.ts b/apps/sim/tools/crw/crw.test.ts new file mode 100644 index 00000000000..85cba32d33c --- /dev/null +++ b/apps/sim/tools/crw/crw.test.ts @@ -0,0 +1,159 @@ +/** + * @vitest-environment node + */ +import { describe, expect, it } from 'vitest' +import { resolveCrwBaseUrl } from '@/tools/crw/base-url' +import { crawlTool } from '@/tools/crw/crawl' +import { mapTool } from '@/tools/crw/map' +import { scrapeTool } from '@/tools/crw/scrape' +import { searchTool } from '@/tools/crw/search' + +const respond = (body: unknown) => new Response(JSON.stringify(body)) + +describe('crw base-url', () => { + it('defaults to the managed cloud base URL', () => { + expect(resolveCrwBaseUrl()).toBe('https://fastcrw.com/api') + expect(resolveCrwBaseUrl('')).toBe('https://fastcrw.com/api') + }) + + it('honors a self-hosted override and strips trailing slashes', () => { + 
expect(resolveCrwBaseUrl('http://localhost:3000')).toBe('http://localhost:3000') + expect(resolveCrwBaseUrl('http://localhost:3000/')).toBe('http://localhost:3000') + }) +}) + +describe('crw scrape', () => { + const buildUrl = scrapeTool.request.url as (p: Record) => string + const buildBody = scrapeTool.request.body as ( + p: Record + ) => Record + const transform = scrapeTool.transformResponse! + + it('targets the default scrape endpoint and overrides for self-host', () => { + expect(buildUrl({ apiKey: 'k', url: 'https://example.com' })).toBe( + 'https://fastcrw.com/api/v1/scrape' + ) + expect(buildUrl({ apiKey: 'k', url: 'https://example.com', baseUrl: 'http://localhost:3000' })).toBe( + 'http://localhost:3000/v1/scrape' + ) + }) + + it('defaults formats to markdown and forwards optional params', () => { + const body = buildBody({ + apiKey: 'k', + url: 'https://example.com', + onlyMainContent: true, + waitFor: 500, + }) + expect(body).toMatchObject({ + url: 'https://example.com', + formats: ['markdown'], + onlyMainContent: true, + waitFor: 500, + }) + }) + + it('maps the documented scrape response (data envelope)', async () => { + const result = await transform( + respond({ + success: true, + data: { + markdown: '# Hello', + html: '

Hello

', + metadata: { title: 'Hello', sourceURL: 'https://example.com', statusCode: 200 }, + }, + }) + ) + + expect(result.success).toBe(true) + expect(result.output).toMatchObject({ + markdown: '# Hello', + html: '

Hello

', + metadata: { title: 'Hello', sourceURL: 'https://example.com', statusCode: 200 }, + }) + }) +}) + +describe('crw search', () => { + const buildUrl = searchTool.request.url as (p: Record) => string + const buildBody = searchTool.request.body as ( + p: Record + ) => Record + const transform = searchTool.transformResponse! + + it('targets the search endpoint', () => { + expect(buildUrl({ apiKey: 'k', query: 'sim' })).toBe('https://fastcrw.com/api/v1/search') + }) + + it('coerces limit and forwards sources', () => { + const body = buildBody({ apiKey: 'k', query: 'sim', limit: 5, sources: ['web'] }) + expect(body).toEqual({ query: 'sim', limit: 5, sources: ['web'] }) + }) + + it('maps the documented search response', async () => { + const result = await transform( + respond({ + success: true, + data: [{ title: 'Sim', url: 'https://sim.ai', description: 'AI workspace' }], + }) + ) + expect(result.output.data).toEqual([ + { title: 'Sim', url: 'https://sim.ai', description: 'AI workspace' }, + ]) + }) +}) + +describe('crw map', () => { + const buildUrl = mapTool.request.url as (p: Record) => string + const transform = mapTool.transformResponse! 
+ + it('targets the map endpoint', () => { + expect(buildUrl({ apiKey: 'k', url: 'https://example.com' })).toBe( + 'https://fastcrw.com/api/v1/map' + ) + }) + + it('maps the documented map response', async () => { + const result = await transform( + respond({ success: true, links: ['https://example.com', 'https://example.com/about'] }) + ) + expect(result.success).toBe(true) + expect(result.output.links).toEqual(['https://example.com', 'https://example.com/about']) + }) + + it('returns an empty links array when none are present', async () => { + const result = await transform(respond({ success: true })) + expect(result.output.links).toEqual([]) + }) +}) + +describe('crw crawl', () => { + const buildUrl = crawlTool.request.url as (p: Record) => string + const buildBody = crawlTool.request.body as ( + p: Record + ) => Record + const transform = crawlTool.transformResponse! + + it('targets the crawl endpoint', () => { + expect(buildUrl({ apiKey: 'k', url: 'https://example.com' })).toBe( + 'https://fastcrw.com/api/v1/crawl' + ) + }) + + it('defaults maxPages and builds scrapeOptions', () => { + const body = buildBody({ apiKey: 'k', url: 'https://example.com', formats: ['markdown'] }) + expect(body).toMatchObject({ + url: 'https://example.com', + maxPages: 100, + scrapeOptions: { formats: ['markdown'], onlyMainContent: false }, + }) + }) + + it('returns the job id from the async create response', async () => { + const result = await transform(respond({ success: true, id: 'job-123' })) + expect(result.success).toBe(true) + expect(result.output.jobId).toBe('job-123') + expect(result.output.pages).toEqual([]) + expect(result.output.total).toBe(0) + }) +}) diff --git a/apps/sim/tools/crw/index.ts b/apps/sim/tools/crw/index.ts new file mode 100644 index 00000000000..ea8d68ace11 --- /dev/null +++ b/apps/sim/tools/crw/index.ts @@ -0,0 +1,9 @@ +import { crawlTool } from '@/tools/crw/crawl' +import { mapTool } from '@/tools/crw/map' +import { scrapeTool } from 
'@/tools/crw/scrape' +import { searchTool } from '@/tools/crw/search' + +export const crwScrapeTool = scrapeTool +export const crwSearchTool = searchTool +export const crwCrawlTool = crawlTool +export const crwMapTool = mapTool diff --git a/apps/sim/tools/crw/map.ts b/apps/sim/tools/crw/map.ts new file mode 100644 index 00000000000..a266354988a --- /dev/null +++ b/apps/sim/tools/crw/map.ts @@ -0,0 +1,94 @@ +import { resolveCrwBaseUrl } from '@/tools/crw/base-url' +import type { MapParams, MapResponse } from '@/tools/crw/types' +import type { ToolConfig } from '@/tools/types' + +export const mapTool: ToolConfig = { + id: 'crw_map', + name: 'fastCRW Map', + description: + 'Get a complete list of URLs from any website quickly and reliably. Useful for discovering all pages on a site without crawling them.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The base URL to map and discover links from (e.g., "https://example.com")', + }, + limit: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum number of links to return (e.g., 100, 1000, 5000)', + }, + baseUrl: { + type: 'string', + required: false, + visibility: 'user-only', + description: 'Base URL for self-hosted fastCRW (defaults to https://fastcrw.com/api)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'fastCRW API key', + }, + }, + + hosting: { + envKeyPrefix: 'CRW_API_KEY', + apiKeyParam: 'apiKey', + byokProviderId: 'crw', + // fastCRW is BYOK-only — Sim does not meter usage. 
+ pricing: { type: 'per_request', cost: 0 }, + rateLimit: { + mode: 'per_request', + requestsPerMinute: 100, + }, + }, + + request: { + method: 'POST', + url: (params) => `${resolveCrwBaseUrl(params.baseUrl)}/v1/map`, + headers: (params) => ({ + 'Content-Type': 'application/json', + Authorization: `Bearer ${params.apiKey}`, + }), + body: (params) => { + const body: Record = { + url: params.url, + } + + if (params.limit) body.limit = Number(params.limit) + + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await response.json() + + return { + success: data.success, + output: { + success: data.success, + links: data.links || [], + }, + } + }, + + outputs: { + success: { + type: 'boolean', + description: 'Whether the mapping operation was successful', + }, + links: { + type: 'array', + description: 'Array of discovered URLs from the website', + items: { + type: 'string', + }, + }, + }, +} diff --git a/apps/sim/tools/crw/scrape.ts b/apps/sim/tools/crw/scrape.ts new file mode 100644 index 00000000000..45bf2c18801 --- /dev/null +++ b/apps/sim/tools/crw/scrape.ts @@ -0,0 +1,109 @@ +import { resolveCrwBaseUrl } from '@/tools/crw/base-url' +import type { ScrapeParams, ScrapeResponse } from '@/tools/crw/types' +import { PAGE_METADATA_OUTPUT_PROPERTIES } from '@/tools/crw/types' +import { safeAssign } from '@/tools/safe-assign' +import type { ToolConfig } from '@/tools/types' + +export const scrapeTool: ToolConfig = { + id: 'crw_scrape', + name: 'fastCRW Website Scraper', + description: + 'Extract structured content from web pages with comprehensive metadata support. 
Converts content to markdown or HTML while capturing SEO metadata, Open Graph tags, and page information.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The URL to scrape content from (e.g., "https://example.com/page")', + }, + scrapeOptions: { + type: 'json', + required: false, + visibility: 'hidden', + description: 'Options for content scraping', + }, + baseUrl: { + type: 'string', + required: false, + visibility: 'user-only', + description: 'Base URL for self-hosted fastCRW (defaults to https://fastcrw.com/api)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'fastCRW API key', + }, + }, + + hosting: { + envKeyPrefix: 'CRW_API_KEY', + apiKeyParam: 'apiKey', + byokProviderId: 'crw', + // fastCRW is BYOK-only — Sim does not meter usage. + pricing: { type: 'per_request', cost: 0 }, + rateLimit: { + mode: 'per_request', + requestsPerMinute: 100, + }, + }, + + request: { + method: 'POST', + url: (params) => `${resolveCrwBaseUrl(params.baseUrl)}/v1/scrape`, + headers: (params) => ({ + 'Content-Type': 'application/json', + Authorization: `Bearer ${params.apiKey}`, + }), + body: (params) => { + const body: Record = { + url: params.url, + formats: params.formats || params.scrapeOptions?.formats || ['markdown'], + } + + if (typeof params.onlyMainContent === 'boolean') body.onlyMainContent = params.onlyMainContent + if (params.includeTags) body.includeTags = params.includeTags + if (params.excludeTags) body.excludeTags = params.excludeTags + if (params.headers) body.headers = params.headers + if (params.waitFor) body.waitFor = Number(params.waitFor) + if (params.renderJs != null) body.renderJs = params.renderJs + if (params.cssSelector) body.cssSelector = params.cssSelector + if (params.xpath) body.xpath = params.xpath + if (params.jsonSchema) body.jsonSchema = params.jsonSchema + if (params.proxy) body.proxy = params.proxy + if (typeof 
params.stealth === 'boolean') body.stealth = params.stealth + + if (params.scrapeOptions) { + safeAssign(body, params.scrapeOptions as Record) + } + + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await response.json() + const result = data.data ?? data + + return { + success: true, + output: { + markdown: result.markdown, + html: result.html, + metadata: result.metadata, + }, + } + }, + + outputs: { + markdown: { type: 'string', description: 'Page content in markdown format' }, + html: { type: 'string', description: 'Raw HTML content of the page', optional: true }, + metadata: { + type: 'object', + description: 'Page metadata including SEO and Open Graph information', + properties: PAGE_METADATA_OUTPUT_PROPERTIES, + }, + }, +} diff --git a/apps/sim/tools/crw/search.ts b/apps/sim/tools/crw/search.ts new file mode 100644 index 00000000000..fce4f8c808d --- /dev/null +++ b/apps/sim/tools/crw/search.ts @@ -0,0 +1,87 @@ +import { resolveCrwBaseUrl } from '@/tools/crw/base-url' +import type { SearchParams, SearchResponse } from '@/tools/crw/types' +import { SEARCH_RESULT_OUTPUT_PROPERTIES } from '@/tools/crw/types' +import type { ToolConfig } from '@/tools/types' + +export const searchTool: ToolConfig = { + id: 'crw_search', + name: 'fastCRW Search', + description: 'Search for information on the web using fastCRW', + version: '1.0.0', + + params: { + query: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The search query to use', + }, + baseUrl: { + type: 'string', + required: false, + visibility: 'user-only', + description: 'Base URL for self-hosted fastCRW (defaults to https://fastcrw.com/api)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'fastCRW API key', + }, + }, + + hosting: { + envKeyPrefix: 'CRW_API_KEY', + apiKeyParam: 'apiKey', + byokProviderId: 'crw', + // fastCRW is BYOK-only — Sim does not meter usage. 
+ pricing: { type: 'per_request', cost: 0 }, + rateLimit: { + mode: 'per_request', + requestsPerMinute: 100, + }, + }, + + request: { + method: 'POST', + url: (params) => `${resolveCrwBaseUrl(params.baseUrl)}/v1/search`, + headers: (params) => ({ + 'Content-Type': 'application/json', + Authorization: `Bearer ${params.apiKey}`, + }), + body: (params) => { + const body: Record = { + query: params.query, + } + + // Add optional parameters if provided (truthy check filters empty strings, null, undefined) + if (params.limit) body.limit = Number(params.limit) + if (params.sources) body.sources = params.sources + if (params.scrapeOptions) body.scrapeOptions = params.scrapeOptions + + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await response.json() + + return { + success: true, + output: { + data: data.data, + }, + } + }, + + outputs: { + data: { + type: 'array', + description: 'Search results data with scraped content and metadata', + items: { + type: 'object', + properties: SEARCH_RESULT_OUTPUT_PROPERTIES, + }, + }, + }, +} diff --git a/apps/sim/tools/crw/types.ts b/apps/sim/tools/crw/types.ts new file mode 100644 index 00000000000..228bbabed0a --- /dev/null +++ b/apps/sim/tools/crw/types.ts @@ -0,0 +1,276 @@ +import type { OutputProperty, ToolResponse } from '@/tools/types' + +/** + * Shared output property definitions for fastCRW API responses. + * + * fastCRW is a Firecrawl-compatible web data engine (single Rust binary, + * self-host or managed cloud). The REST shapes mirror Firecrawl's, so these + * definitions follow the same structure as the Firecrawl provider. 
+ * + * API Reference: https://fastcrw.com/docs/rest-api + * - Scrape: POST /v1/scrape + * - Crawl: POST /v1/crawl, GET /v1/crawl/{id} + * - Search: POST /v1/search + * - Map: POST /v1/map + */ + +/** + * Output definition for page metadata in scrape responses + * Based on the fastCRW metadata object structure from POST /v1/scrape + */ +export const PAGE_METADATA_OUTPUT_PROPERTIES = { + title: { type: 'string', description: 'Page title' }, + description: { type: 'string', description: 'Page meta description', optional: true }, + language: { type: 'string', description: 'Page language code (e.g., "en")', optional: true }, + sourceURL: { type: 'string', description: 'Original source URL that was scraped' }, + statusCode: { type: 'number', description: 'HTTP status code of the response' }, + keywords: { type: 'string', description: 'Page meta keywords', optional: true }, + robots: { + type: 'string', + description: 'Robots meta directive (e.g., "follow, index")', + optional: true, + }, + ogTitle: { type: 'string', description: 'Open Graph title', optional: true }, + ogDescription: { type: 'string', description: 'Open Graph description', optional: true }, + ogUrl: { type: 'string', description: 'Open Graph URL', optional: true }, + ogImage: { type: 'string', description: 'Open Graph image URL', optional: true }, + ogLocaleAlternate: { + type: 'array', + description: 'Alternate locale versions for Open Graph', + optional: true, + items: { type: 'string', description: 'Locale code' }, + }, + ogSiteName: { type: 'string', description: 'Open Graph site name', optional: true }, + error: { type: 'string', description: 'Error message if scrape failed', optional: true }, +} as const satisfies Record + +/** + * Complete page metadata output definition + */ +export const PAGE_METADATA_OUTPUT: OutputProperty = { + type: 'object', + description: 'Page metadata including SEO and Open Graph information', + properties: PAGE_METADATA_OUTPUT_PROPERTIES, +} + +/** + * Simplified 
metadata for crawl responses (subset of full metadata) + * Based on crawl data[].metadata structure from GET /v1/crawl/{id} + */ +export const CRAWL_METADATA_OUTPUT_PROPERTIES = { + title: { type: 'string', description: 'Page title' }, + description: { type: 'string', description: 'Page meta description', optional: true }, + language: { type: 'string', description: 'Page language code', optional: true }, + sourceURL: { type: 'string', description: 'Original source URL' }, + statusCode: { type: 'number', description: 'HTTP status code' }, + ogLocaleAlternate: { + type: 'array', + description: 'Alternate locale versions', + optional: true, + items: { type: 'string', description: 'Locale code' }, + }, +} as const satisfies Record + +/** + * Complete crawl metadata output definition + */ +export const CRAWL_METADATA_OUTPUT: OutputProperty = { + type: 'object', + description: 'Page metadata from crawl operation', + properties: CRAWL_METADATA_OUTPUT_PROPERTIES, +} + +/** + * Search result metadata properties + * Based on search data[].metadata structure from POST /v1/search + */ +export const SEARCH_METADATA_OUTPUT_PROPERTIES = { + title: { type: 'string', description: 'Page title', optional: true }, + description: { type: 'string', description: 'Page meta description', optional: true }, + sourceURL: { type: 'string', description: 'Original source URL' }, + statusCode: { type: 'number', description: 'HTTP status code', optional: true }, + error: { type: 'string', description: 'Error message if scrape failed', optional: true }, +} as const satisfies Record + +/** + * Complete search metadata output definition + */ +export const SEARCH_METADATA_OUTPUT: OutputProperty = { + type: 'object', + description: 'Metadata about the search result page', + properties: SEARCH_METADATA_OUTPUT_PROPERTIES, +} + +/** + * Output properties for crawled page items + * Based on GET /v1/crawl/{id} response data[] array items + */ +export const CRAWLED_PAGE_OUTPUT_PROPERTIES = { + markdown: { 
type: 'string', description: 'Page content in markdown format' }, + html: { type: 'string', description: 'Processed HTML content of the page', optional: true }, + rawHtml: { type: 'string', description: 'Unprocessed raw HTML content', optional: true }, + links: { + type: 'array', + description: 'Array of links found on the page', + optional: true, + items: { type: 'string', description: 'URL found on the page' }, + }, + metadata: CRAWL_METADATA_OUTPUT, +} as const satisfies Record + +/** + * Output properties for search result items + * Based on POST /v1/search response data[] array items + */ +export const SEARCH_RESULT_OUTPUT_PROPERTIES = { + title: { type: 'string', description: 'Search result title from search engine' }, + description: { + type: 'string', + description: 'Search result description/snippet from search engine', + }, + url: { type: 'string', description: 'URL of the search result' }, + markdown: { + type: 'string', + description: 'Page content in markdown (when sources include scraped content)', + optional: true, + }, + metadata: SEARCH_METADATA_OUTPUT, +} as const satisfies Record + +// Common types +interface ScrapeOptions { + formats?: string[] + onlyMainContent?: boolean + includeTags?: string[] + excludeTags?: string[] + headers?: Record + waitFor?: number + renderJs?: boolean | null + cssSelector?: string + xpath?: string + jsonSchema?: Record + proxy?: string + stealth?: boolean +} + +export interface ScrapeParams { + apiKey: string + baseUrl?: string + url: string + scrapeOptions?: ScrapeOptions + // Additional top-level scrape params + onlyMainContent?: boolean + formats?: string[] + includeTags?: string[] + excludeTags?: string[] + headers?: Record + waitFor?: number + renderJs?: boolean | null + cssSelector?: string + xpath?: string + jsonSchema?: Record + proxy?: string + stealth?: boolean +} + +export interface SearchParams { + apiKey: string + baseUrl?: string + query: string + limit?: number + sources?: ('web' | 'images')[] + 
scrapeOptions?: ScrapeOptions +} + +export interface CrwCrawlParams { + apiKey: string + baseUrl?: string + url: string + maxPages?: number + maxDepth?: number + formats?: string[] + onlyMainContent?: boolean + excludePaths?: string[] + includePaths?: string[] + scrapeOptions?: ScrapeOptions +} + +export interface MapParams { + apiKey: string + baseUrl?: string + url: string + limit?: number +} + +export interface ScrapeResponse extends ToolResponse { + output: { + markdown: string + html?: string + rawHtml?: string + links?: string[] + metadata: { + title: string + description?: string + language?: string + keywords?: string + robots?: string + ogTitle?: string + ogDescription?: string + ogUrl?: string + ogImage?: string + ogLocaleAlternate?: string[] + ogSiteName?: string + sourceURL: string + statusCode: number + error?: string + } + } +} + +export interface SearchResponse extends ToolResponse { + output: { + data: Array<{ + title: string + description: string + url: string + markdown?: string + metadata?: { + title?: string + description?: string + sourceURL: string + statusCode?: number + error?: string + } + }> + } +} + +export interface CrwCrawlResponse extends ToolResponse { + output: { + jobId?: string + pages: Array<{ + markdown: string + html?: string + rawHtml?: string + links?: string[] + metadata: { + title: string + description?: string + language?: string + sourceURL: string + statusCode: number + ogLocaleAlternate?: string[] + } + }> + total: number + } +} + +export interface MapResponse extends ToolResponse { + output: { + success: boolean + links: string[] + } +} + +export type CrwResponse = ScrapeResponse | SearchResponse | CrwCrawlResponse | MapResponse diff --git a/apps/sim/tools/registry.ts b/apps/sim/tools/registry.ts index 18a16223e61..808cbfc79f7 100644 --- a/apps/sim/tools/registry.ts +++ b/apps/sim/tools/registry.ts @@ -536,6 +536,7 @@ import { crowdstrikeGetSensorDetailsTool, crowdstrikeQuerySensorsTool, } from '@/tools/crowdstrike' 
+import { crwCrawlTool, crwMapTool, crwScrapeTool, crwSearchTool } from '@/tools/crw' import { cursorAddFollowupTool, cursorAddFollowupV2Tool, @@ -3869,6 +3870,10 @@ export const tools: Record = { file_get_content: fileGetContentTool, file_read: fileReadTool, file_write: fileWriteTool, + crw_scrape: crwScrapeTool, + crw_search: crwSearchTool, + crw_crawl: crwCrawlTool, + crw_map: crwMapTool, firecrawl_scrape: firecrawlScrapeTool, firecrawl_search: firecrawlSearchTool, firecrawl_crawl: firecrawlCrawlTool, diff --git a/apps/sim/tools/types.ts b/apps/sim/tools/types.ts index c8da61e06cc..42bd7652c37 100644 --- a/apps/sim/tools/types.ts +++ b/apps/sim/tools/types.ts @@ -13,6 +13,7 @@ export type BYOKProviderId = | 'ollama-cloud' | 'falai' | 'firecrawl' + | 'crw' | 'exa' | 'serper' | 'jina' From 056eac2d5068d0de7d1b28f4441f23e03fd7046d Mon Sep 17 00:00:00 2001 From: us Date: Sat, 13 Jun 2026 22:40:41 +0300 Subject: [PATCH 2/2] fix: propagate fastCRW API-level errors in crw tool responses scrape, search, and crawl transformResponse hardcoded success: true, masking HTTP 200 responses with { success: false, error }. They now reflect data.success and surface the error, matching map.ts. Crawl additionally fails fast when job creation has no id, preventing a poll loop against /v1/crawl/undefined. Adds error-path tests. 
--- apps/sim/tools/crw/crawl.ts | 11 +++++++++++ apps/sim/tools/crw/crw.test.ts | 19 +++++++++++++++++++ apps/sim/tools/crw/scrape.ts | 3 ++- apps/sim/tools/crw/search.ts | 3 ++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/apps/sim/tools/crw/crawl.ts b/apps/sim/tools/crw/crawl.ts index 1ae3ced76a5..00135e203c1 100644 --- a/apps/sim/tools/crw/crawl.ts +++ b/apps/sim/tools/crw/crawl.ts @@ -116,6 +116,17 @@ export const crawlTool: ToolConfig = { transformResponse: async (response: Response) => { const data = await response.json() + if (data.success === false || !data.id) { + return { + success: false, + error: data.error || 'fastCRW crawl job creation failed', + output: { + pages: [], + total: 0, + }, + } + } + return { success: true, output: { diff --git a/apps/sim/tools/crw/crw.test.ts b/apps/sim/tools/crw/crw.test.ts index 85cba32d33c..151f2d60900 100644 --- a/apps/sim/tools/crw/crw.test.ts +++ b/apps/sim/tools/crw/crw.test.ts @@ -72,6 +72,12 @@ describe('crw scrape', () => { metadata: { title: 'Hello', sourceURL: 'https://example.com', statusCode: 200 }, }) }) + + it('reports failure when the API body indicates an error', async () => { + const result = await transform(respond({ success: false, error: 'invalid url' })) + expect(result.success).toBe(false) + expect(result.error).toBe('invalid url') + }) }) describe('crw search', () => { @@ -101,6 +107,12 @@ describe('crw search', () => { { title: 'Sim', url: 'https://sim.ai', description: 'AI workspace' }, ]) }) + + it('reports failure when the API body indicates an error', async () => { + const result = await transform(respond({ success: false, error: 'search unavailable' })) + expect(result.success).toBe(false) + expect(result.error).toBe('search unavailable') + }) }) describe('crw map', () => { @@ -156,4 +168,11 @@ describe('crw crawl', () => { expect(result.output.pages).toEqual([]) expect(result.output.total).toBe(0) }) + + it('fails fast when job creation reports an error instead of 
polling', async () => { + const result = await transform(respond({ success: false, error: 'quota exceeded' })) + expect(result.success).toBe(false) + expect(result.error).toBe('quota exceeded') + expect(result.output.jobId).toBeUndefined() + }) }) diff --git a/apps/sim/tools/crw/scrape.ts b/apps/sim/tools/crw/scrape.ts index 45bf2c18801..d436b90a77a 100644 --- a/apps/sim/tools/crw/scrape.ts +++ b/apps/sim/tools/crw/scrape.ts @@ -88,7 +88,8 @@ export const scrapeTool: ToolConfig = { const result = data.data ?? data return { - success: true, + success: data.success !== false, + error: data.success === false ? data.error || 'fastCRW scrape failed' : undefined, output: { markdown: result.markdown, html: result.html, diff --git a/apps/sim/tools/crw/search.ts b/apps/sim/tools/crw/search.ts index fce4f8c808d..93d66ef9709 100644 --- a/apps/sim/tools/crw/search.ts +++ b/apps/sim/tools/crw/search.ts @@ -67,7 +67,8 @@ export const searchTool: ToolConfig = { const data = await response.json() return { - success: true, + success: data.success !== false, + error: data.success === false ? data.error || 'fastCRW search failed' : undefined, output: { data: data.data, },