Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion docs/guides/architecture_overview.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class ParselCrawler

class BeautifulSoupCrawler

class PydanticAiCrawler

class PlaywrightCrawler

class AdaptivePlaywrightCrawler
Expand All @@ -65,18 +67,20 @@ BasicCrawler --|> AdaptivePlaywrightCrawler
AbstractHttpCrawler --|> HttpCrawler
AbstractHttpCrawler --|> ParselCrawler
AbstractHttpCrawler --|> BeautifulSoupCrawler
AbstractHttpCrawler --|> PydanticAiCrawler
PlaywrightCrawler --|> StagehandCrawler
```

### HTTP crawlers

HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients).

HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are three crawlers that belong to this category:
HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are four crawlers that belong to this category:

- <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser.
- <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML.
- <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> does not parse HTTP responses at all and is used when no content parsing is required.
- <ApiLink to="class/PydanticAiCrawler">`PydanticAiCrawler`</ApiLink> parses HTML with Parsel and uses an LLM to extract structured data into a validated Pydantic model.

You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers).

Expand Down Expand Up @@ -120,6 +124,8 @@ class ParselCrawlingContext

class BeautifulSoupCrawlingContext

class PydanticAiCrawlingContext

class PlaywrightPreNavCrawlingContext

class PlaywrightCrawlingContext
Expand Down Expand Up @@ -148,6 +154,8 @@ ParsedHttpCrawlingContext --|> ParselCrawlingContext

ParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext

ParselCrawlingContext --|> PydanticAiCrawlingContext

BasicCrawlingContext --|> PlaywrightPreNavCrawlingContext

PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext
Expand All @@ -168,6 +176,7 @@ They have a similar inheritance structure as the crawlers, with the base class b
- <ApiLink to="class/ParsedHttpCrawlingContext">`ParsedHttpCrawlingContext`</ApiLink> for HTTP crawlers with parsed responses.
- <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing.
- <ApiLink to="class/BeautifulSoupCrawlingContext">`BeautifulSoupCrawlingContext`</ApiLink> for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing.
- <ApiLink to="class/PydanticAiCrawlingContext">`PydanticAiCrawlingContext`</ApiLink> for the `PydanticAiCrawler`, extending the Parsel context with an `extract` helper for LLM-based structured extraction.
- <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> for Playwright crawlers before the page is navigated.
- <ApiLink to="class/PlaywrightCrawlingContext">`PlaywrightCrawlingContext`</ApiLink> for Playwright crawlers.
- <ApiLink to="class/AdaptivePlaywrightPreNavCrawlingContext">`AdaptivePlaywrightPreNavCrawlingContext`</ApiLink> for Adaptive Playwright crawlers before the page is navigated.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import asyncio

from pydantic import BaseModel
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider

from crawlee.crawlers import PydanticAiCrawler, PydanticAiCrawlingContext


class Post(BaseModel):
    """Model representing a single post."""

    # NOTE(review): Pydantic places the class docstring and the field names in the
    # generated JSON schema, which presumably reaches the LLM as the output
    # schema - treat both as prompt text, not only as documentation.
    title: str  # Required string field.
    url: str  # Required; typed as plain `str`, so it is not validated as a URL.


class Posts(BaseModel):
    """Model representing the extracted list of posts."""

    # Wrapper model: `main` passes it to a single `context.extract` call so one
    # extraction can return several `Post` items.
    posts: list[Post]


async def main() -> None:
    """Crawl the Hacker News front page and store a narrowed list of posts.

    Builds an OpenAI chat model with an explicit API key, creates a
    `PydanticAiCrawler` limited to 5 requests, and registers a default handler
    that extracts a `Posts` instance and pushes its dump via `push_data`.
    """
    openai_provider = OpenAIProvider(api_key='your-openai-api-key')
    llm = OpenAIChatModel('gpt-5.4-nano', provider=openai_provider)
    crawler = PydanticAiCrawler(model=llm, max_requests_per_crawl=5)

    @crawler.router.default_handler
    async def handler(context: PydanticAiCrawlingContext) -> None:
        """Extract the posts from the current page and push them as data."""
        # The extra instruction narrows what the model returns from the page.
        instruction = 'Extract only the top five posts on the page.'
        extracted = await context.extract(
            Posts,
            additional_instructions=instruction,
        )

        await context.push_data(extracted.model_dump())

    start_urls = ['https://news.ycombinator.com']
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
43 changes: 43 additions & 0 deletions docs/guides/code_examples/pydantic_ai_crawler/basic_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import asyncio

from pydantic import BaseModel
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider

from crawlee.crawlers import PydanticAiCrawler, PydanticAiCrawlingContext


class Article(BaseModel):
    """Model representing the extracted data for an article."""

    # NOTE(review): the docstring above and the field names below end up in the
    # JSON schema Pydantic generates; presumably that schema is what the LLM
    # sees, so rename or reword them deliberately.
    title: str  # Required string field.
    short_text: str  # Required string; presumably a brief summary - confirm.


async def main() -> None:
    """Crawl crawlee.dev and extract an `Article` from every visited page.

    The crawl is capped at 5 requests. Each page is logged, extracted into an
    `Article`, pushed via `push_data`, and its links are enqueued.
    """
    # Passing a `Model` instance lets the API key be set explicitly. A
    # provider-prefixed string such as 'openai:gpt-5.4-nano' would instead read
    # the key from the provider's env var like OPENAI_API_KEY.
    openai_provider = OpenAIProvider(api_key='your-openai-api-key')
    llm = OpenAIChatModel('gpt-5.4-nano', provider=openai_provider)

    # Given only `model`, the crawler defaults to a PydanticAiDirectExtractor.
    crawler = PydanticAiCrawler(model=llm, max_requests_per_crawl=5)

    @crawler.router.default_handler
    async def handler(context: PydanticAiCrawlingContext) -> None:
        """Log the page URL, extract an `Article`, store it, follow links."""
        context.log.info(f'Processing {context.request.url} ...')

        # Hand over a Pydantic model class; a validated instance comes back.
        extracted = await context.extract(Article)
        await context.push_data(extracted.model_dump())

        await context.enqueue_links()

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import asyncio

from html_to_markdown import convert
from lxml_html_clean import Cleaner
from pydantic import BaseModel
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider

from crawlee.crawlers import (
BasePydanticAiHtmlDistiller,
PydanticAiCrawler,
PydanticAiCrawlingContext,
PydanticAiDirectExtractor,
get_basic_http_cleaner,
)

# Notes appended to the model instructions so it knows the input format.
# `MarkdownDistiller.__init__` forwards this string to its base class as
# `prompt_notes`; it is prompt text, so edit the wording with care.
MARKDOWN_PROMPT_NOTES = 'The document is Markdown converted from the HTML page.'


class MarkdownDistiller(BasePydanticAiHtmlDistiller):
    """Distiller that turns cleaned page HTML into a Markdown document."""

    def __init__(self, cleaner: Cleaner | None = None) -> None:
        """Set up the distiller.

        Args:
            cleaner: HTML cleaner applied before the Markdown conversion. When
                omitted, the one from `get_basic_http_cleaner()` is used.
        """
        super().__init__(prompt_notes=MARKDOWN_PROMPT_NOTES)

        # Scripts, styles, and similar noise are removed ahead of conversion.
        self._cleaner = cleaner if cleaner else get_basic_http_cleaner()

    def distill(self, html: str) -> str:
        """Clean `html`, convert it to Markdown, and return the text.

        Returns an empty string when the conversion yields no content.
        """
        cleaned_html = self._cleaner.clean_html(html)
        markdown = convert(cleaned_html).content
        return markdown or ''


class Article(BaseModel):
    """Model representing the extracted data for an article."""

    # NOTE(review): Pydantic includes the docstring and field names in the JSON
    # schema; presumably the model receives that schema, so they act as part of
    # the extraction prompt.
    title: str  # Required string field.
    short_text: str  # Required string; presumably a brief summary - confirm.


async def main() -> None:
    """Crawl crawlee.dev, extracting articles from Markdown-distilled pages.

    A `PydanticAiDirectExtractor` is configured with `MarkdownDistiller`, and
    the crawl is capped at 5 requests.
    """
    # The provider carries the API key explicitly.
    openai_provider = OpenAIProvider(api_key='your-openai-api-key')
    llm = OpenAIChatModel('gpt-5.4-nano', provider=openai_provider)

    # The custom distiller converts each page to Markdown before extraction.
    markdown_extractor = PydanticAiDirectExtractor(
        model=llm,
        distiller=MarkdownDistiller(),
    )
    crawler = PydanticAiCrawler(
        extractor=markdown_extractor,
        max_requests_per_crawl=5,
    )

    @crawler.router.default_handler
    async def handler(context: PydanticAiCrawlingContext) -> None:
        """Extract an `Article`, store it, and enqueue the page's links."""
        # Hand over a Pydantic model class; a validated instance comes back.
        extracted = await context.extract(Article)
        await context.push_data(extracted.model_dump())

        # Link enqueueing works as usual; distillation and extraction leave the
        # rest of the crawling logic untouched.
        await context.enqueue_links()

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
74 changes: 74 additions & 0 deletions docs/guides/code_examples/pydantic_ai_crawler/debugging_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import asyncio

from pydantic import BaseModel
from pydantic_ai import capture_run_messages
from pydantic_ai.exceptions import UnexpectedModelBehavior
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider

from crawlee import ConcurrencySettings
from crawlee.crawlers import (
PydanticAiCleanHtmlDistiller,
PydanticAiCrawler,
PydanticAiCrawlingContext,
PydanticAiDirectExtractor,
)


class Article(BaseModel):
    """Model representing the extracted data for an article."""

    # NOTE(review): docstring and field names are part of the JSON schema that
    # Pydantic generates; presumably they show up in the captured model
    # messages, so keep that in mind when reading the debug output.
    title: str  # Required string field.
    short_text: str  # Required string; presumably a brief summary - confirm.


async def main() -> None:
    """Run a debugging-oriented crawl of crawlee.dev.

    The crawl runs one request at a time, is capped at 10 requests, and aborts
    on the first error. For every page it logs the distilled document and all
    messages exchanged with the model.
    """
    openai_provider = OpenAIProvider(api_key='your-openai-api-key')
    llm = OpenAIChatModel('gpt-5.4-nano', provider=openai_provider)

    # A single distiller instance is shared by the extractor and the handler,
    # so the handler logs exactly what the extractor feeds to the model.
    shared_distiller = PydanticAiCleanHtmlDistiller()
    direct_extractor = PydanticAiDirectExtractor(
        model=llm,
        distiller=shared_distiller,
    )

    # Concurrency of 1 ensures only one request is processed at a time.
    one_at_a_time = ConcurrencySettings(desired_concurrency=1, max_concurrency=1)

    crawler = PydanticAiCrawler(
        max_requests_per_crawl=10,
        extractor=direct_extractor,
        concurrency_settings=one_at_a_time,
        # Stop the crawl as soon as an error occurs during extraction.
        abort_on_error=True,
    )

    @crawler.router.default_handler
    async def handler(context: PydanticAiCrawlingContext) -> None:
        """Log the distilled page and the model exchange, then store the data."""
        # Show the distilled document the model actually reads. On real pages
        # this can be tens of KB.
        page_html = context.selector.get()
        context.log.info(shared_distiller.distill(page_html))

        # Record the prompts, responses, and retries exchanged with the model.
        with capture_run_messages() as exchanged:
            try:
                article = await context.extract(Article)
            except UnexpectedModelBehavior:
                context.log.exception(f'Extraction failed for {context.request.url}.')
                raise
            finally:
                # Runs on success and failure alike; one log line per message.
                for message in exchanged:
                    context.log.info(f'{message}')

        await context.push_data(article.model_dump())

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import asyncio

from pydantic import BaseModel
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider

from crawlee import Glob
from crawlee.crawlers import (
PydanticAiCrawler,
PydanticAiCrawlingContext,
PydanticAiDirectExtractor,
PydanticAiSelectorExtractor,
)


class Article(BaseModel):
    """Model representing the extracted data for an article."""

    # NOTE(review): Pydantic includes the docstring and field names in the JSON
    # schema; presumably the extractor relies on that schema, so changing them
    # changes what is asked of the model.
    title: str  # Required string field.
    main_text: str  # Required string; presumably the article body - confirm.


async def main() -> None:
    """Crawl the Crawlee blog with a selector-based extractor.

    The default handler only enqueues blog links under the 'article' label.
    The 'article' handler extracts an `Article` from each of those pages. The
    crawl is capped at 10 requests.
    """
    openai_provider = OpenAIProvider(api_key='your-openai-api-key')
    llm = OpenAIChatModel('gpt-5.4-nano', provider=openai_provider)

    # Pages the cached selectors cannot handle fall back to direct extraction.
    direct_fallback = PydanticAiDirectExtractor(model=llm)
    selector_extractor = PydanticAiSelectorExtractor(
        model=llm,
        fallback=direct_fallback,
    )
    crawler = PydanticAiCrawler(
        extractor=selector_extractor,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handler(context: PydanticAiCrawlingContext) -> None:
        """Enqueue blog article pages for the 'article' handler."""
        blog_pages = [Glob('https://crawlee.dev/blog/*')]
        await context.enqueue_links(include=blog_pages, label='article')

    @crawler.router.handler('article')
    async def article_handler(context: PydanticAiCrawlingContext) -> None:
        """Extract an `Article` from a blog page and push it as data."""
        # The first page generates selectors; later pages reuse them with no
        # LLM call.
        extracted = await context.extract(Article)

        await context.push_data(extracted.model_dump())

    start_urls = ['https://crawlee.dev/blog']
    await crawler.run(start_urls)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
Loading
Loading