apify · Mantisus · Jun 14, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 16, 2026
diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx
@@ -49,6 +49,8 @@ class ParselCrawler
 
 class BeautifulSoupCrawler
 
+class PydanticAiCrawler
+
 class PlaywrightCrawler
 
 class AdaptivePlaywrightCrawler
@@ -65,18 +67,20 @@ BasicCrawler --|> AdaptivePlaywrightCrawler
 AbstractHttpCrawler --|> HttpCrawler
 AbstractHttpCrawler --|> ParselCrawler
 AbstractHttpCrawler --|> BeautifulSoupCrawler
+AbstractHttpCrawler --|> PydanticAiCrawler
 PlaywrightCrawler --|> StagehandCrawler
 ```
 
 ### HTTP crawlers
 
 HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients).
 
-HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are three crawlers that belong to this category:
+HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are four crawlers that belong to this category:
 
 - <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser.
 - <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML.
 - <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> does not parse HTTP responses at all and is used when no content parsing is required.
+- <ApiLink to="class/PydanticAiCrawler">`PydanticAiCrawler`</ApiLink> parses HTML with Parsel and uses an LLM to extract structured data into a validated Pydantic model.
 
 You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers).
 
@@ -120,6 +124,8 @@ class ParselCrawlingContext
 
 class BeautifulSoupCrawlingContext
 
+class PydanticAiCrawlingContext
+
 class PlaywrightPreNavCrawlingContext
 
 class PlaywrightCrawlingContext
@@ -148,6 +154,8 @@ ParsedHttpCrawlingContext --|> ParselCrawlingContext
 
 ParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext
 
+ParselCrawlingContext --|> PydanticAiCrawlingContext
+
 BasicCrawlingContext --|> PlaywrightPreNavCrawlingContext
 
 PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext
@@ -168,6 +176,7 @@ They have a similar inheritance structure as the crawlers, with the base class b
 - <ApiLink to="class/ParsedHttpCrawlingContext">`ParsedHttpCrawlingContext`</ApiLink> for HTTP crawlers with parsed responses.
 - <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing.
 - <ApiLink to="class/BeautifulSoupCrawlingContext">`BeautifulSoupCrawlingContext`</ApiLink> for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing.
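+- <ApiLink to="class/PydanticAiCrawlingContext">`PydanticAiCrawlingContext`</ApiLink> for <ApiLink to="class/PydanticAiCrawler">`PydanticAiCrawler`</ApiLink>, extending the Parsel context with an `extract` helper that returns a validated Pydantic model instance.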
 - <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> for Playwright crawlers before the page is navigated.
 - <ApiLink to="class/PlaywrightCrawlingContext">`PlaywrightCrawlingContext`</ApiLink> for Playwright crawlers.
 - <ApiLink to="class/AdaptivePlaywrightPreNavCrawlingContext">`AdaptivePlaywrightPreNavCrawlingContext`</ApiLink> for Adaptive Playwright crawlers before the page is navigated.

diff --git a/docs/guides/code_examples/pydantic_ai_crawler/additional_instructions_example.py b/docs/guides/code_examples/pydantic_ai_crawler/additional_instructions_example.py
@@ -0,0 +1,44 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import PydanticAiCrawler, PydanticAiCrawlingContext
+
+
+class Post(BaseModel):
+    """Model representing a single post."""
+
+    title: str
+    url: str
+
+
+class Posts(BaseModel):
+    """Model representing the extracted list of posts."""
+
+    posts: list[Post]
+
+
+async def main() -> None:
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        provider=OpenAIProvider(api_key='your-openai-api-key'),
+    )
+    crawler = PydanticAiCrawler(model=model, max_requests_per_crawl=5)
+
+    @crawler.router.default_handler
+    async def handler(context: PydanticAiCrawlingContext) -> None:
+        # The instruction narrows what the model returns from the page.
+        posts = await context.extract(
+            Posts,
+            additional_instructions='Extract only the top five posts on the page.',
+        )
+
+        await context.push_data(posts.model_dump())
+
+    await crawler.run(['https://news.ycombinator.com'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/pydantic_ai_crawler/basic_example.py b/docs/guides/code_examples/pydantic_ai_crawler/basic_example.py
@@ -0,0 +1,43 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import PydanticAiCrawler, PydanticAiCrawlingContext
+
+
+class Article(BaseModel):
+    """Model representing the extracted data for an article."""
+
+    title: str
+    short_text: str
+
+
+async def main() -> None:
+    # A `Model` instance sets the API key explicitly. A provider-prefixed string such as
+    # 'openai:gpt-5.4-nano' reads the key from the provider's env var like OPENAI_API_KEY.
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        provider=OpenAIProvider(api_key='your-openai-api-key'),
+    )
+
+    # With only `model`, the crawler uses a PydanticAiDirectExtractor by default.
+    crawler = PydanticAiCrawler(model=model, max_requests_per_crawl=5)
+
+    @crawler.router.default_handler
+    async def handler(context: PydanticAiCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+        # Pass a Pydantic model and get a validated instance back.
+        article = await context.extract(Article)
+
+        await context.push_data(article.model_dump())
+
+        await context.enqueue_links()
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/pydantic_ai_crawler/custom_distiller_example.py b/docs/guides/code_examples/pydantic_ai_crawler/custom_distiller_example.py
@@ -0,0 +1,67 @@
+import asyncio
+
+from html_to_markdown import convert
+from lxml_html_clean import Cleaner
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import (
+    BasePydanticAiHtmlDistiller,
+    PydanticAiCrawler,
+    PydanticAiCrawlingContext,
+    PydanticAiDirectExtractor,
+    get_basic_http_cleaner,
+)
+
+# Notes appended to the model instructions so it knows the input format.
+MARKDOWN_PROMPT_NOTES = 'The document is Markdown converted from the HTML page.'
+
+
+class MarkdownDistiller(BasePydanticAiHtmlDistiller):
+    """Distiller that cleans the page HTML and converts it to Markdown."""
+
+    def __init__(self, cleaner: Cleaner | None = None) -> None:
+        super().__init__(prompt_notes=MARKDOWN_PROMPT_NOTES)
+        # Strip scripts, styles, and other noise before the conversion.
+        self._cleaner = cleaner or get_basic_http_cleaner()
+
+    def distill(self, html: str) -> str:
+        """Return `html` cleaned and converted to Markdown ('' if empty)."""
+        return convert(self._cleaner.clean_html(html)).content or ''
+
+
+class Article(BaseModel):
+    """Model representing the extracted data for an article."""
+
+    title: str
+    short_text: str
+
+
+async def main() -> None:
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        # Set the provider with the API key explicitly.
+        provider=OpenAIProvider(api_key='your-openai-api-key'),
+    )
+    crawler = PydanticAiCrawler(
+        # Use the custom distiller to convert the page to Markdown before extraction.
+        extractor=PydanticAiDirectExtractor(model=model, distiller=MarkdownDistiller()),
+        max_requests_per_crawl=5,
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: PydanticAiCrawlingContext) -> None:
+        # Pass a Pydantic model and get a validated instance back.
+        article = await context.extract(Article)
+        await context.push_data(article.model_dump())
+
+        # Enqueue links as usual, the distillation and extraction don't affect
+        # the rest of the crawling logic.
+        await context.enqueue_links()
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/pydantic_ai_crawler/debugging_example.py b/docs/guides/code_examples/pydantic_ai_crawler/debugging_example.py
@@ -0,0 +1,74 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai import capture_run_messages
+from pydantic_ai.exceptions import UnexpectedModelBehavior
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee import ConcurrencySettings
+from crawlee.crawlers import (
+    PydanticAiCleanHtmlDistiller,
+    PydanticAiCrawler,
+    PydanticAiCrawlingContext,
+    PydanticAiDirectExtractor,
+)
+
+
+class Article(BaseModel):
+    """Model representing the extracted data for an article."""
+
+    title: str
+    short_text: str
+
+
+async def main() -> None:
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        provider=OpenAIProvider(api_key='your-openai-api-key'),
+    )
+    # Build the distiller once so the extractor and the handler below share
+    # the same instance.
+    distiller = PydanticAiCleanHtmlDistiller()
+    crawler = PydanticAiCrawler(
+        max_requests_per_crawl=10,
+        # Create a direct extractor with your distiller.
+        extractor=PydanticAiDirectExtractor(
+            model,
+            distiller=distiller,
+        ),
+        # Set concurrency to 1, which ensures only one request is processed at a time.
+        concurrency_settings=ConcurrencySettings(
+            desired_concurrency=1, max_concurrency=1
+        ),
+        # Set abort_on_error to True to stop the crawl if an error occurs during
+        # extraction.
+        abort_on_error=True,
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: PydanticAiCrawlingContext) -> None:
+        # Inspect the distilled document the model actually reads, using the same
+        # distiller the extractor runs. On real pages this can be tens of KB.
+        distilled = distiller.distill(context.selector.get())
+        context.log.info(distilled)
+
+        # Capture the prompts, responses, and retries exchanged with the model.
+        with capture_run_messages() as messages:
+            try:
+                article = await context.extract(Article)
+            except UnexpectedModelBehavior:
+                context.log.exception(f'Extraction failed for {context.request.url}.')
+                raise
+            finally:
+                # Log each exchanged message on its own line for readability.
+                for message in messages:
+                    context.log.info(f'{message}')
+
+        await context.push_data(article.model_dump())
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/pydantic_ai_crawler/selector_extractor_example.py b/docs/guides/code_examples/pydantic_ai_crawler/selector_extractor_example.py
@@ -0,0 +1,56 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee import Glob
+from crawlee.crawlers import (
+    PydanticAiCrawler,
+    PydanticAiCrawlingContext,
+    PydanticAiDirectExtractor,
+    PydanticAiSelectorExtractor,
+)
+
+
+class Article(BaseModel):
+    """Model representing the extracted data for an article."""
+
+    title: str
+    main_text: str
+
+
+async def main() -> None:
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        provider=OpenAIProvider(api_key='your-openai-api-key'),
+    )
+    crawler = PydanticAiCrawler(
+        extractor=PydanticAiSelectorExtractor(
+            model=model,
+            # Pages the cached selectors cannot handle fall back to direct extraction.
+            fallback=PydanticAiDirectExtractor(model=model),
+        ),
+        max_requests_per_crawl=10,
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: PydanticAiCrawlingContext) -> None:
+        # Enqueue blog article pages; the article handler extracts the data.
+        await context.enqueue_links(
+            include=[Glob('https://crawlee.dev/blog/*')],
+            label='article',
+        )
+
+    @crawler.router.handler('article')
+    async def article_handler(context: PydanticAiCrawlingContext) -> None:
+        # The first page generates selectors; later pages reuse them with no LLM call.
+        article = await context.extract(Article)
+
+        await context.push_data(article.model_dump())
+
+    await crawler.run(['https://crawlee.dev/blog'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())