# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors # # SPDX-License-Identifier: Apache-2.0 from typing import Any, Optional, Union import requests from pydantic import BaseModel from autogen.import_utils import optional_import_block, require_optional_import from autogen.tools import Tool with optional_import_block(): import wikipediaapi # Maximum allowed length for a query string. MAX_QUERY_LENGTH = 300 # Maximum number of pages to retrieve from a search. MAX_PAGE_RETRIEVE = 100 # Maximum number of characters to return from a Wikipedia page. MAX_ARTICLE_LENGTH = 10000 class Document(BaseModel): """Pydantic model representing a Wikipedia document. Attributes: page_content (str): Textual content of the Wikipedia page (possibly truncated). metadata (dict[str, str]): Additional info, including: - source URL - title - pageid - timestamp - word count - size """ page_content: str metadata: dict[str, str] class WikipediaClient: """Client for interacting with the Wikipedia API. Supports searching and page retrieval on a specified language edition. Public methods: search(query: str, limit: int) -> list[dict[str, Any]] get_page(title: str) -> Optional[wikipediaapi.WikipediaPage] Attributes: base_url (str): URL of the MediaWiki API endpoint. headers (dict[str, str]): HTTP headers, including User-Agent. wiki (wikipediaapi.Wikipedia): Low-level Wikipedia API client. """ def __init__(self, language: str = "en", tool_name: str = "wikipedia-client") -> None: """Initialize the WikipediaClient. Args: language (str): ISO code of the Wikipedia edition (e.g., 'en', 'es'). tool_name (str): Identifier for User-Agent header. """ self.base_url = f"https://{language}.wikipedia.org/w/api.php" self.headers = {"User-Agent": f"autogen.Agent ({tool_name})"} self.wiki = wikipediaapi.Wikipedia( language=language, extract_format=wikipediaapi.ExtractFormat.WIKI, user_agent=f"autogen.Agent ({tool_name})", ) def search(self, query: str, limit: int = 3) -> Any: """Search Wikipedia for pages matching a query string. Args: query (str): The search keywords. limit (int): Max number of results to return. Returns: list[dict[str, Any]]: Each dict has keys: - 'title' (str) - 'size' (int) - 'wordcount' (int) - 'timestamp' (str) Raises: requests.HTTPError: If the HTTP request to the API fails. """ params = { "action": "query", "format": "json", "list": "search", "srsearch": query, "srlimit": str(limit), "srprop": "size|wordcount|timestamp", } response = requests.get(url=self.base_url, params=params, headers=self.headers) response.raise_for_status() data = response.json() search_data = data.get("query", {}).get("search", []) return search_data def get_page(self, title: str) -> Optional[Any]: """Retrieve a WikipediaPage object by title. Args: title (str): Title of the Wikipedia page. Returns: wikipediaapi.WikipediaPage | None: The page object if it exists, otherwise None. Raises: wikipediaapi.WikipediaException: On lower‑level API errors. """ page = self.wiki.page(title) if not page.exists(): return None return page @require_optional_import(["wikipediaapi"], "wikipedia") class WikipediaQueryRunTool(Tool): """Tool for querying Wikipedia and returning summarized page results. This tool uses the `wikipediaapi` package to perform searches against a specified language edition of Wikipedia and returns up to `top_k` page summaries. Public methods: query_run(query: str) -> list[str] | str Attributes: language (str): Language code for the Wikipedia edition (e.g., 'en', 'es'). top_k (int): Max number of page summaries returned (≤ MAX_PAGE_RETRIEVE). verbose (bool): If True, enables debug logging to stdout. wiki_cli (WikipediaClient): Internal client for Wikipedia API calls. """ def __init__(self, language: str = "en", top_k: int = 3, verbose: bool = False) -> None: """Initialize the WikipediaQueryRunTool. Args: language (str): ISO code of the Wikipedia edition to query. top_k (int): Desired number of summaries (capped by MAX_PAGE_RETRIEVE). verbose (bool): If True, print debug information during searches. """ self.language = language self.tool_name = "wikipedia-query-run" self.wiki_cli = WikipediaClient(language, self.tool_name) self.top_k = min(top_k, MAX_PAGE_RETRIEVE) self.verbose = verbose super().__init__( name=self.tool_name, description="Run a Wikipedia query and return page summaries.", func_or_tool=self.query_run, ) def query_run(self, query: str) -> Union[list[str], str]: """Search Wikipedia and return formatted page summaries. Truncates `query` to MAX_QUERY_LENGTH before searching. Args: query (str): Search term(s) to look up in Wikipedia. Returns: list[str]: Each element is "Page: