OS-World/xiangyi-li · BenchFlow

mirrored 18 minutes ago
Benchmark Card Files and versions Leaderboard
Linxin SongCoACT initialize (#292) b968155
# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
#
# SPDX-License-Identifier: Apache-2.0
#
# Portions derived from  https://github.com/microsoft/autogen are under the MIT License.
# SPDX-License-Identifier: MIT
"""Create an OpenAI-compatible client using Together.AI's API.

Example:
    ```python
    llm_config = {
        "config_list": [
            {
                "api_type": "together",
                "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
                "api_key": os.environ.get("TOGETHER_API_KEY"),
            }
        ]
    }

    agent = autogen.AssistantAgent("my_agent", llm_config=llm_config)
    ```

Install Together.AI python library using: pip install --upgrade together

Resources:
- https://docs.together.ai/docs/inference-python
"""

from __future__ import annotations

import copy
import os
import time
import warnings
from typing import Any, Literal, Optional, Union

from pydantic import Field

from ..import_utils import optional_import_block, require_optional_import
from ..llm_config import LLMConfigEntry, register_llm_config
from .client_utils import should_hide_tools, validate_parameter
from .oai_models import ChatCompletion, ChatCompletionMessage, ChatCompletionMessageToolCall, Choice, CompletionUsage

with optional_import_block():
    from together import Together


@register_llm_config
class TogetherLLMConfigEntry(LLMConfigEntry):
    api_type: Literal["together"] = "together"
    max_tokens: int = Field(default=512, ge=0)
    stream: bool = False
    temperature: Optional[float] = Field(default=None)
    top_p: Optional[float] = Field(default=None)
    top_k: Optional[int] = Field(default=None)
    repetition_penalty: Optional[float] = Field(default=None)
    presence_penalty: Optional[float] = Field(default=None, ge=-2, le=2)
    frequency_penalty: Optional[float] = Field(default=None, ge=-2, le=2)
    min_p: Optional[float] = Field(default=None, ge=0, le=1)
    safety_model: Optional[str] = None
    hide_tools: Literal["if_all_run", "if_any_run", "never"] = "never"
    price: Optional[list[float]] = Field(default=None, min_length=2, max_length=2)
    tool_choice: Optional[Union[str, dict[str, Union[str, dict[str, str]]]]] = (
        None  # dict is the tool to call: {"type": "function", "function": {"name": "my_function"}}
    )

    def create_client(self):
        raise NotImplementedError("TogetherLLMConfigEntry.create_client is not implemented.")


class TogetherClient:
    """Client for Together.AI's API."""

    def __init__(self, **kwargs):
        """Requires api_key or environment variable to be set

        Args:
            **kwargs: Additional keyword arguments to pass to the client.
        """
        # Ensure we have the api_key upon instantiation
        self.api_key = kwargs.get("api_key")
        if not self.api_key:
            self.api_key = os.getenv("TOGETHER_API_KEY")

        if "response_format" in kwargs and kwargs["response_format"] is not None:
            warnings.warn("response_format is not supported for Together.AI, it will be ignored.", UserWarning)

        assert self.api_key, (
            "Please include the api_key in your config list entry for Together.AI or set the TOGETHER_API_KEY env variable."
        )

    def message_retrieval(self, response) -> list:
        """Retrieve and return a list of strings or a list of Choice.Message from the response.

        NOTE: if a list of Choice.Message is returned, it currently needs to contain the fields of OpenAI's ChatCompletion Message object,
        since that is expected for function or tool calling in the rest of the codebase at the moment, unless a custom agent is being used.
        """
        return [choice.message for choice in response.choices]

    def cost(self, response) -> float:
        return response.cost

    @staticmethod
    def get_usage(response) -> dict:
        """Return usage summary of the response using RESPONSE_USAGE_KEYS."""
        # ...  # pragma: no cover
        return {
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
            "total_tokens": response.usage.total_tokens,
            "cost": response.cost,
            "model": response.model,
        }

    def parse_params(self, params: dict[str, Any]) -> dict[str, Any]:
        """Loads the parameters for Together.AI API from the passed in parameters and returns a validated set. Checks types, ranges, and sets defaults"""
        together_params = {}

        # Check that we have what we need to use Together.AI's API
        together_params["model"] = params.get("model")
        assert together_params["model"], (
            "Please specify the 'model' in your config list entry to nominate the Together.AI model to use."
        )

        # Validate allowed Together.AI parameters
        # https://github.com/togethercomputer/together-python/blob/94ffb30daf0ac3e078be986af7228f85f79bde99/src/together/resources/completions.py#L44
        together_params["max_tokens"] = validate_parameter(params, "max_tokens", int, True, 512, (0, None), None)
        together_params["stream"] = validate_parameter(params, "stream", bool, False, False, None, None)
        together_params["temperature"] = validate_parameter(params, "temperature", (int, float), True, None, None, None)
        together_params["top_p"] = validate_parameter(params, "top_p", (int, float), True, None, None, None)
        together_params["top_k"] = validate_parameter(params, "top_k", int, True, None, None, None)
        together_params["repetition_penalty"] = validate_parameter(
            params, "repetition_penalty", float, True, None, None, None
        )
        together_params["presence_penalty"] = validate_parameter(
            params, "presence_penalty", (int, float), True, None, (-2, 2), None
        )
        together_params["frequency_penalty"] = validate_parameter(
            params, "frequency_penalty", (int, float), True, None, (-2, 2), None
        )
        together_params["min_p"] = validate_parameter(params, "min_p", (int, float), True, None, (0, 1), None)
        together_params["safety_model"] = validate_parameter(
            params, "safety_model", str, True, None, None, None
        )  # We won't enforce the available models as they are likely to change

        # Check if they want to stream and use tools, which isn't currently supported (TODO)
        if together_params["stream"] and "tools" in params:
            warnings.warn(
                "Streaming is not supported when using tools, streaming will be disabled.",
                UserWarning,
            )

            together_params["stream"] = False

        if "tool_choice" in params:
            together_params["tool_choice"] = params["tool_choice"]

        return together_params

    @require_optional_import("together", "together")
    def create(self, params: dict) -> ChatCompletion:
        messages = params.get("messages", [])

        # Convert AG2 messages to Together.AI messages
        together_messages = oai_messages_to_together_messages(messages)

        # Parse parameters to Together.AI API's parameters
        together_params = self.parse_params(params)

        # Add tools to the call if we have them and aren't hiding them
        if "tools" in params:
            hide_tools = validate_parameter(
                params, "hide_tools", str, False, "never", None, ["if_all_run", "if_any_run", "never"]
            )
            if not should_hide_tools(together_messages, params["tools"], hide_tools):
                together_params["tools"] = params["tools"]

        together_params["messages"] = together_messages

        # We use chat model by default
        client = Together(api_key=self.api_key)

        # Token counts will be returned
        prompt_tokens = 0
        completion_tokens = 0
        total_tokens = 0

        response = client.chat.completions.create(**together_params)
        if together_params["stream"]:
            # Read in the chunks as they stream
            ans = ""
            for chunk in response:
                ans = ans + (chunk.choices[0].delta.content or "")

            prompt_tokens = chunk.usage.prompt_tokens
            completion_tokens = chunk.usage.completion_tokens
            total_tokens = chunk.usage.total_tokens
        else:
            ans: str = response.choices[0].message.content

            prompt_tokens = response.usage.prompt_tokens
            completion_tokens = response.usage.completion_tokens
            total_tokens = response.usage.total_tokens

        if response.choices[0].finish_reason == "tool_calls":
            together_finish = "tool_calls"
            tool_calls = []
            for tool_call in response.choices[0].message.tool_calls:
                tool_calls.append(
                    ChatCompletionMessageToolCall(
                        id=tool_call.id,
                        function={"name": tool_call.function.name, "arguments": tool_call.function.arguments},
                        type="function",
                    )
                )
        else:
            together_finish = "stop"
            tool_calls = None

        # 3. convert output
        message = ChatCompletionMessage(
            role="assistant",
            content=response.choices[0].message.content,
            function_call=None,
            tool_calls=tool_calls,
        )
        choices = [Choice(finish_reason=together_finish, index=0, message=message)]

        response_oai = ChatCompletion(
            id=response.id,
            model=together_params["model"],
            created=int(time.time()),
            object="chat.completion",
            choices=choices,
            usage=CompletionUsage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            ),
            cost=calculate_together_cost(prompt_tokens, completion_tokens, together_params["model"]),
        )

        return response_oai


def oai_messages_to_together_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert messages from OAI format to Together.AI format.
    We correct for any specific role orders and types.
    """
    together_messages = copy.deepcopy(messages)

    # If we have a message with role='tool', which occurs when a function is executed, change it to 'user'
    for msg in together_messages:
        if "role" in msg and msg["role"] == "tool":
            msg["role"] = "user"

    return together_messages


# MODELS AND COSTS
chat_lang_code_model_sizes = {
    "zero-one-ai/Yi-34B-Chat": 34,
    "allenai/OLMo-7B-Instruct": 7,
    "allenai/OLMo-7B-Twin-2T": 7,
    "allenai/OLMo-7B": 7,
    "Austism/chronos-hermes-13b": 13,
    "deepseek-ai/deepseek-coder-33b-instruct": 33,
    "deepseek-ai/deepseek-llm-67b-chat": 67,
    "garage-bAInd/Platypus2-70B-instruct": 70,
    "google/gemma-2b-it": 2,
    "google/gemma-7b-it": 7,
    "Gryphe/MythoMax-L2-13b": 13,
    "lmsys/vicuna-13b-v1.5": 13,
    "lmsys/vicuna-7b-v1.5": 7,
    "codellama/CodeLlama-13b-Instruct-hf": 13,
    "codellama/CodeLlama-34b-Instruct-hf": 34,
    "codellama/CodeLlama-70b-Instruct-hf": 70,
    "codellama/CodeLlama-7b-Instruct-hf": 7,
    "meta-llama/Llama-2-70b-chat-hf": 70,
    "meta-llama/Llama-2-13b-chat-hf": 13,
    "meta-llama/Llama-2-7b-chat-hf": 7,
    "meta-llama/Llama-3-8b-chat-hf": 8,
    "meta-llama/Llama-3-70b-chat-hf": 70,
    "mistralai/Mistral-7B-Instruct-v0.1": 7,
    "mistralai/Mistral-7B-Instruct-v0.2": 7,
    "mistralai/Mistral-7B-Instruct-v0.3": 7,
    "NousResearch/Nous-Capybara-7B-V1p9": 7,
    "NousResearch/Nous-Hermes-llama-2-7b": 7,
    "NousResearch/Nous-Hermes-Llama2-13b": 13,
    "NousResearch/Nous-Hermes-2-Yi-34B": 34,
    "openchat/openchat-3.5-1210": 7,
    "Open-Orca/Mistral-7B-OpenOrca": 7,
    "Qwen/Qwen1.5-0.5B-Chat": 0.5,
    "Qwen/Qwen1.5-1.8B-Chat": 1.8,
    "Qwen/Qwen1.5-4B-Chat": 4,
    "Qwen/Qwen1.5-7B-Chat": 7,
    "Qwen/Qwen1.5-14B-Chat": 14,
    "Qwen/Qwen1.5-32B-Chat": 32,
    "Qwen/Qwen1.5-72B-Chat": 72,
    "Qwen/Qwen1.5-110B-Chat": 110,
    "Qwen/Qwen2-72B-Instruct": 72,
    "snorkelai/Snorkel-Mistral-PairRM-DPO": 7,
    "togethercomputer/alpaca-7b": 7,
    "teknium/OpenHermes-2-Mistral-7B": 7,
    "teknium/OpenHermes-2p5-Mistral-7B": 7,
    "togethercomputer/Llama-2-7B-32K-Instruct": 7,
    "togethercomputer/RedPajama-INCITE-Chat-3B-v1": 3,
    "togethercomputer/RedPajama-INCITE-7B-Chat": 7,
    "togethercomputer/StripedHyena-Nous-7B": 7,
    "Undi95/ReMM-SLERP-L2-13B": 13,
    "Undi95/Toppy-M-7B": 7,
    "WizardLM/WizardLM-13B-V1.2": 13,
    "upstage/SOLAR-10.7B-Instruct-v1.0": 11,
}

# Cost per million tokens based on up to X Billion parameters, e.g. up 4B is $0.1/million
chat_lang_code_model_costs = {4: 0.1, 8: 0.2, 21: 0.3, 41: 0.8, 80: 0.9, 110: 1.8}

mixture_model_sizes = {
    "cognitivecomputations/dolphin-2.5-mixtral-8x7b": 56,
    "databricks/dbrx-instruct": 132,
    "mistralai/Mixtral-8x7B-Instruct-v0.1": 47,
    "mistralai/Mixtral-8x22B-Instruct-v0.1": 141,
    "NousResearch/Nous-Hermes-2-Mistral-7B-DPO": 7,
    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": 47,
    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT": 47,
    "Snowflake/snowflake-arctic-instruct": 480,
}

# Cost per million tokens based on up to X Billion parameters, e.g. up 56B is $0.6/million
mixture_costs = {56: 0.6, 176: 1.2, 480: 2.4}


def calculate_together_cost(input_tokens: int, output_tokens: int, model_name: str) -> float:
    """Cost calculation for inference"""
    if model_name in chat_lang_code_model_sizes or model_name in mixture_model_sizes:
        cost_per_mil = 0

        # Chat, Language, Code models
        if model_name in chat_lang_code_model_sizes:
            size_in_b = chat_lang_code_model_sizes[model_name]

            for top_size in chat_lang_code_model_costs:
                if size_in_b <= top_size:
                    cost_per_mil = chat_lang_code_model_costs[top_size]
                    break

        else:
            # Mixture-of-experts
            size_in_b = mixture_model_sizes[model_name]

            for top_size in mixture_costs:
                if size_in_b <= top_size:
                    cost_per_mil = mixture_costs[top_size]
                    break

        if cost_per_mil == 0:
            warnings.warn("Model size doesn't align with cost structure.", UserWarning)

        return cost_per_mil * ((input_tokens + output_tokens) / 1e6)

    else:
        # Model is not in our list of models, can't determine the cost
        warnings.warn(
            "The model isn't catered for costing, to apply costs you can use the 'price' key on your config_list.",
            UserWarning,
        )

        return 0