# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
#
# SPDX-License-Identifier: Apache-2.0

from typing import Annotated, Any, Optional, Union

from pydantic import BaseModel

from ....doc_utils import export_module
from ....import_utils import optional_import_block, require_optional_import
from ....interop import LiteLLmConfigFactory
from ....llm_config import LLMConfig
from ... import Tool
from ...dependency_injection import Depends, on

# crawl4ai is an optional dependency: this context manager presumably suppresses the
# ImportError here, and @require_optional_import on the tool class reports the missing
# package at use time instead of at import time — see import_utils for the contract.
with optional_import_block():
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

__all__ = ["Crawl4AITool"]


@require_optional_import(["crawl4ai"], "crawl4ai")
@export_module("autogen.tools.experimental")
class Crawl4AITool(Tool):
    """
    Crawl a website and extract information using the crawl4ai library.

    When an ``llm_config`` is supplied, the tool registers an LLM-backed variant that
    extracts structured content according to a per-call instruction; otherwise it
    registers a plain crawler that returns the page as markdown.
    """

    def __init__(
        self,
        llm_config: Optional[Union[LLMConfig, dict[str, Any]]] = None,
        extraction_model: Optional[type[BaseModel]] = None,
        llm_strategy_kwargs: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Initialize the Crawl4AITool.

        Args:
            llm_config: The config dictionary for the LLM model. If None, the tool will run without LLM.
            extraction_model: The Pydantic model to use for extraction. If None, the tool will use the default schema.
            llm_strategy_kwargs: The keyword arguments to pass to the LLM extraction strategy.

        Raises:
            ValueError: If `llm_strategy_kwargs` is provided without `llm_config`, or if it
                contains keys this tool manages itself ('provider', 'api_token', 'schema',
                'instruction').
        """
        Crawl4AITool._validate_llm_strategy_kwargs(llm_strategy_kwargs, llm_config_provided=(llm_config is not None))

        async def crawl4ai_helper(  # type: ignore[no-any-unimported]
            url: str,
            browser_cfg: Optional["BrowserConfig"] = None,
            crawl_config: Optional["CrawlerRunConfig"] = None,
        ) -> Any:
            # Shared crawl routine. Without a crawl_config we return the raw markdown;
            # with one (the LLM path) we return the extracted content on success, or
            # the crawler's error message on failure.
            async with AsyncWebCrawler(config=browser_cfg) as crawler:
                result = await crawler.arun(
                    url=url,
                    config=crawl_config,
                )

            if crawl_config is None:
                response = result.markdown
            else:
                response = result.extracted_content if result.success else result.error_message

            return response

        async def crawl4ai_without_llm(
            url: Annotated[str, "The url to crawl and extract information from."],
        ) -> Any:
            return await crawl4ai_helper(url=url)

        async def crawl4ai_with_llm(
            url: Annotated[str, "The url to crawl and extract information from."],
            instruction: Annotated[str, "The instruction to provide on how and what to extract."],
            # The three Depends(on(...)) parameters are injected from the constructor
            # arguments and hidden from the LLM-facing tool signature.
            llm_config: Annotated[Any, Depends(on(llm_config))],
            llm_strategy_kwargs: Annotated[Optional[dict[str, Any]], Depends(on(llm_strategy_kwargs))],
            extraction_model: Annotated[Optional[type[BaseModel]], Depends(on(extraction_model))],
        ) -> Any:
            browser_cfg = BrowserConfig(headless=True)
            crawl_config = Crawl4AITool._get_crawl_config(
                llm_config=llm_config,
                instruction=instruction,
                extraction_model=extraction_model,
                llm_strategy_kwargs=llm_strategy_kwargs,
            )

            return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config)

        super().__init__(
            name="crawl4ai",
            description="Crawl a website and extract information.",
            func_or_tool=crawl4ai_without_llm if llm_config is None else crawl4ai_with_llm,
        )

    @staticmethod
    def _validate_llm_strategy_kwargs(llm_strategy_kwargs: Optional[dict[str, Any]], llm_config_provided: bool) -> None:
        """Reject `llm_strategy_kwargs` entries that this tool manages itself.

        Args:
            llm_strategy_kwargs: User-supplied kwargs destined for `LLMExtractionStrategy`, or None.
            llm_config_provided: Whether the caller also supplied an `llm_config`.

        Raises:
            ValueError: If kwargs are given without an `llm_config`, or contain any of
                the reserved keys 'provider', 'api_token', 'schema', or 'instruction'.
        """
        if not llm_strategy_kwargs:
            return

        if not llm_config_provided:
            raise ValueError("llm_strategy_kwargs can only be provided if llm_config is also provided.")

        # Collect one message per offending key so the caller sees every problem at once.
        errors: list[str] = [
            f"'{key}' should not be provided in llm_strategy_kwargs. It is automatically set based on llm_config.\n"
            for key in ["provider", "api_token"]
            if key in llm_strategy_kwargs
        ]

        if "schema" in llm_strategy_kwargs:
            errors.append(
                "'schema' should not be provided in llm_strategy_kwargs. It is automatically set based on extraction_model type.\n"
            )

        if "instruction" in llm_strategy_kwargs:
            errors.append(
                "'instruction' should not be provided in llm_strategy_kwargs. It is provided at the time of calling the tool.\n"
            )

        if errors:
            raise ValueError("".join(errors))

    @staticmethod
    def _get_crawl_config(  # type: ignore[no-any-unimported]
        llm_config: Union[LLMConfig, dict[str, Any]],
        instruction: str,
        llm_strategy_kwargs: Optional[dict[str, Any]] = None,
        extraction_model: Optional[type[BaseModel]] = None,
    ) -> "CrawlerRunConfig":
        """Build a `CrawlerRunConfig` wired with an LLM extraction strategy.

        Args:
            llm_config: LLM configuration translated to LiteLLM kwargs via `LiteLLmConfigFactory`.
            instruction: The per-call extraction instruction.
            llm_strategy_kwargs: Extra kwargs forwarded to `LLMExtractionStrategy`; not mutated.
            extraction_model: Optional Pydantic model whose JSON schema drives structured extraction.

        Returns:
            A `CrawlerRunConfig` with the extraction strategy attached and caching bypassed.
        """
        lite_llm_config = LiteLLmConfigFactory.create_lite_llm_config(llm_config)

        # Work on a copy: the injected kwargs dict is shared across tool invocations,
        # and the pop() below would otherwise mutate it — the first call would strip
        # 'extraction_type' and later calls would silently fall back to the default.
        llm_strategy_kwargs = dict(llm_strategy_kwargs) if llm_strategy_kwargs else {}

        schema = (
            extraction_model.model_json_schema()
            if (extraction_model and issubclass(extraction_model, BaseModel))
            else None
        )

        # Default to structured ("schema") extraction when a schema is available,
        # free-form ("block") extraction otherwise; the caller may override.
        extraction_type = llm_strategy_kwargs.pop("extraction_type", "schema" if schema else "block")

        # 1. Define the LLM extraction strategy
        llm_strategy = LLMExtractionStrategy(
            **lite_llm_config,
            schema=schema,
            extraction_type=extraction_type,
            instruction=instruction,
            **llm_strategy_kwargs,
        )

        # 2. Build the crawler config; bypass the cache so each call fetches fresh content.
        crawl_config = CrawlerRunConfig(extraction_strategy=llm_strategy, cache_mode=CacheMode.BYPASS)

        return crawl_config