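# Tests for litellm's Anthropic prompt-caching support: request translation
# (tools and system blocks carrying `cache_control`), live cache-usage accounting,
# streaming usage chunks, and prompt-caching-aware Router routing.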
import json
import os
import sys
import traceback

from dotenv import load_dotenv

load_dotenv()
import io

from test_streaming import streaming_format_tests

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

from unittest.mock import AsyncMock, MagicMock, patch

import pytest

import litellm
from litellm import RateLimitError, Timeout, completion, completion_cost, embedding
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
from test_amazing_vertex_completion import load_vertex_ai_credentials

# litellm.num_retries = 3
litellm.cache = None
litellm.success_callback = []

user_message = "Write a short poem about the sky"
messages = [{"content": user_message, "role": "user"}]

def logger_fn(user_model_dict):
    print(f"user_model_dict: {user_model_dict}")


@pytest.fixture(autouse=True)  # assumed fixture decorator - the print below indicates this is meant to run before each test
def reset_callbacks():
    print("\npytest fixture - resetting callbacks")
    litellm.success_callback = []
    litellm._async_success_callback = []
    litellm.failure_callback = []
    litellm.callbacks = []
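
# Mocked request-shape test: a tool definition carrying `cache_control` should be
# translated into Anthropic's `input_schema` + `cache_control` tool format, and the
# prompt-caching beta headers forwarded to /v1/messages (HTTP layer patched, no network).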
@pytest.mark.asyncio  # needed so pytest-asyncio runs this coroutine test
async def test_litellm_anthropic_prompt_caching_tools():
    # Arrange: Set up the MagicMock for the httpx.AsyncClient
    mock_response = AsyncMock()

    def return_val():
        return {
            "id": "msg_01XFDUDYJgAACzvnptvVoYEL",
            "type": "message",
            "role": "assistant",
            "content": [{"type": "text", "text": "Hello!"}],
            "model": "claude-3-5-sonnet-20240620",
            "stop_reason": "end_turn",
            "stop_sequence": None,
            "usage": {"input_tokens": 12, "output_tokens": 6},
        }

    mock_response.json = return_val
    mock_response.headers = {"key": "value"}

    litellm.set_verbose = True
    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        return_value=mock_response,
    ) as mock_post:
        # Act: Call the litellm.acompletion function
        response = await litellm.acompletion(
            api_key="mock_api_key",
            model="anthropic/claude-3-5-sonnet-20240620",
            messages=[
                {"role": "user", "content": "What's the weather like in Boston today?"}
            ],
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "get_current_weather",
                        "description": "Get the current weather in a given location",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "location": {
                                    "type": "string",
                                    "description": "The city and state, e.g. San Francisco, CA",
                                },
                                "unit": {
                                    "type": "string",
                                    "enum": ["celsius", "fahrenheit"],
                                },
                            },
                            "required": ["location"],
                        },
                        "cache_control": {"type": "ephemeral"},
                    },
                }
            ],
            extra_headers={
                "anthropic-version": "2023-06-01",
                "anthropic-beta": "prompt-caching-2024-07-31",
            },
        )

        # Print what was called on the mock
        print("call args=", mock_post.call_args)

        expected_url = "https://api.anthropic.com/v1/messages"
        expected_headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
            "x-api-key": "mock_api_key",
        }

        expected_json = {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "What's the weather like in Boston today?",
                        }
                    ],
                }
            ],
            "tools": [
                {
                    "name": "get_current_weather",
                    "description": "Get the current weather in a given location",
                    "cache_control": {"type": "ephemeral"},
                    "input_schema": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g. San Francisco, CA",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ["celsius", "fahrenheit"],
                            },
                        },
                        "required": ["location"],
                    },
                }
            ],
            "max_tokens": 4096,
            "model": "claude-3-5-sonnet-20240620",
        }

        mock_post.assert_called_once_with(
            expected_url, json=expected_json, headers=expected_headers, timeout=600.0
        )
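
# Shared conversation used by several tests below: a large cacheable system prompt
# plus user turns marked with `cache_control`, mirroring Anthropic's prompt-caching
# documentation examples.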
@pytest.fixture  # fixture decorator added - this is consumed as a fixture argument by the tests below
def anthropic_messages():
    return [
        # System Message
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement" * 400,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {
            "role": "assistant",
            "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
        },
        # The final turn is marked with cache-control, for continuing in followups.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
    ]
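
# Verifies the same cached conversation routed through Vertex AI: the call is mocked
# at the HTTP client, and the test asserts the `anthropic-beta` prompt-caching header
# is not sent on the Vertex AI route.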
@pytest.mark.parametrize("sync_mode", [True, False])  # assumed parametrization, inferred from the sync_mode argument
@pytest.mark.asyncio
async def test_anthropic_vertex_ai_prompt_caching(anthropic_messages, sync_mode):
    litellm._turn_on_debug()
    from litellm.llms.custom_httpx.http_handler import HTTPHandler, AsyncHTTPHandler

    load_vertex_ai_credentials()

    client = HTTPHandler() if sync_mode else AsyncHTTPHandler()

    with patch.object(client, "post", return_value=MagicMock()) as mock_post:
        try:
            if sync_mode:
                response = completion(
                    model="vertex_ai/claude-3-5-sonnet-v2@20241022",
                    messages=anthropic_messages,
                    client=client,
                )
            else:
                response = await litellm.acompletion(
                    model="vertex_ai/claude-3-5-sonnet-v2@20241022",
                    messages=anthropic_messages,
                    client=client,
                )
        except Exception as e:
            print(f"Error: {e}")

        mock_post.assert_called_once()
        print(mock_post.call_args.kwargs["headers"])
        assert "anthropic-beta" not in mock_post.call_args.kwargs["headers"]
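
# Live-API test: sends the cached conversation to Anthropic with the prompt-caching
# beta headers and checks that usage reports either a cache write or a cache read.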
@pytest.mark.asyncio
async def test_anthropic_api_prompt_caching_basic():
    litellm.set_verbose = True
    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement"
                        * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # The final turn is marked with cache-control, for continuing in followups.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
        extra_headers={
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
        },
    )

    print("response=", response)

    assert "cache_read_input_tokens" in response.usage
    assert "cache_creation_input_tokens" in response.usage

    # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
    assert (response.usage.cache_read_input_tokens > 0) or (
        response.usage.cache_creation_input_tokens > 0
    )
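
# Unit test of message translation only (no network): string `content` carrying
# `cache_control` should be expanded into Anthropic's list-of-text-blocks format.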
@pytest.mark.asyncio
async def test_anthropic_api_prompt_caching_with_content_str():
    system_message = [
        {
            "role": "system",
            "content": "Here is the full text of a complex legal agreement",
            "cache_control": {"type": "ephemeral"},
        },
    ]
    translated_system_message = litellm.AnthropicConfig().translate_system_message(
        messages=system_message
    )

    assert translated_system_message == [
        # System Message
        {
            "type": "text",
            "text": "Here is the full text of a complex legal agreement",
            "cache_control": {"type": "ephemeral"},
        }
    ]

    user_messages = [
        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
        {
            "role": "user",
            "content": "What are the key terms and conditions in this agreement?",
            "cache_control": {"type": "ephemeral"},
        },
        {
            "role": "assistant",
            "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
        },
        # The final turn is marked with cache-control, for continuing in followups.
        {
            "role": "user",
            "content": "What are the key terms and conditions in this agreement?",
            "cache_control": {"type": "ephemeral"},
        },
    ]

    translated_messages = anthropic_messages_pt(
        messages=user_messages,
        model="claude-3-5-sonnet-20240620",
        llm_provider="anthropic",
    )

    expected_messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
                }
            ],
        },
        # The final turn is marked with cache-control, for continuing in followups.
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the key terms and conditions in this agreement?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
    ]

    assert len(translated_messages) == len(expected_messages)
    for idx, i in enumerate(translated_messages):
        assert (
            i == expected_messages[idx]
        ), "Error on idx={}. Got={}, Expected={}".format(idx, i, expected_messages[idx])
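
# Same live-API check as the basic test above, but without passing the prompt-caching
# beta headers explicitly; the cache usage fields should still appear in the response.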
@pytest.mark.asyncio
async def test_anthropic_api_prompt_caching_no_headers():
    litellm.set_verbose = True
    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement"
                        * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # The final turn is marked with cache-control, for continuing in followups.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
    )

    print("response=", response)

    assert "cache_read_input_tokens" in response.usage
    assert "cache_creation_input_tokens" in response.usage

    # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
    assert (response.usage.cache_read_input_tokens > 0) or (
        response.usage.cache_creation_input_tokens > 0
    )
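
# Streaming variant: iterates the stream (validating chunk format as it goes) and
# requires the final usage chunk to carry both cache_read_input_tokens and
# cache_creation_input_tokens.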
@pytest.mark.asyncio
async def test_anthropic_api_prompt_caching_streaming():
    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement"
                        * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # The final turn is marked with cache-control, for continuing in followups.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
        stream=True,
        stream_options={"include_usage": True},
    )

    idx = 0
    is_cache_read_input_tokens_in_usage = False
    is_cache_creation_input_tokens_in_usage = False
    async for chunk in response:
        streaming_format_tests(idx=idx, chunk=chunk)
        # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
        if hasattr(chunk, "usage"):
            print("Received final usage - {}".format(chunk.usage))
        if hasattr(chunk, "usage") and hasattr(chunk.usage, "cache_read_input_tokens"):
            is_cache_read_input_tokens_in_usage = True
        if hasattr(chunk, "usage") and hasattr(
            chunk.usage, "cache_creation_input_tokens"
        ):
            is_cache_creation_input_tokens_in_usage = True

        idx += 1

    print("response=", response)

    assert (
        is_cache_read_input_tokens_in_usage and is_cache_creation_input_tokens_in_usage
    )
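
# Mocked request-shape test: a system message containing a `cache_control` text block
# should be sent as Anthropic's top-level `system` array, with the prompt-caching beta
# headers forwarded.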
@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_system():
    # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples
    # Large Context Caching Example
    mock_response = AsyncMock()

    def return_val():
        return {
            "id": "msg_01XFDUDYJgAACzvnptvVoYEL",
            "type": "message",
            "role": "assistant",
            "content": [{"type": "text", "text": "Hello!"}],
            "model": "claude-3-5-sonnet-20240620",
            "stop_reason": "end_turn",
            "stop_sequence": None,
            "usage": {"input_tokens": 12, "output_tokens": 6},
        }

    mock_response.json = return_val
    mock_response.headers = {"key": "value"}

    litellm.set_verbose = True
    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        return_value=mock_response,
    ) as mock_post:
        # Act: Call the litellm.acompletion function
        response = await litellm.acompletion(
            api_key="mock_api_key",
            model="anthropic/claude-3-5-sonnet-20240620",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "You are an AI assistant tasked with analyzing legal documents.",
                        },
                        {
                            "type": "text",
                            "text": "Here is the full text of a complex legal agreement",
                            "cache_control": {"type": "ephemeral"},
                        },
                    ],
                },
                {
                    "role": "user",
                    "content": "what are the key terms and conditions in this agreement?",
                },
            ],
            extra_headers={
                "anthropic-version": "2023-06-01",
                "anthropic-beta": "prompt-caching-2024-07-31",
            },
        )

        # Print what was called on the mock
        print("call args=", mock_post.call_args)

        expected_url = "https://api.anthropic.com/v1/messages"
        expected_headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "anthropic-version": "2023-06-01",
            "anthropic-beta": "prompt-caching-2024-07-31",
            "x-api-key": "mock_api_key",
        }

        expected_json = {
            "system": [
                {
                    "type": "text",
                    "text": "You are an AI assistant tasked with analyzing legal documents.",
                },
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement",
                    "cache_control": {"type": "ephemeral"},
                },
            ],
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "what are the key terms and conditions in this agreement?",
                        }
                    ],
                }
            ],
            "max_tokens": 4096,
            "model": "claude-3-5-sonnet-20240620",
        }

        mock_post.assert_called_once_with(
            expected_url, json=expected_json, headers=expected_headers, timeout=600.0
        )
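
# Checks that litellm classifies the cached conversation as a valid prompt-caching
# prompt for Anthropic models.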
def test_is_prompt_caching_enabled(anthropic_messages):
    assert litellm.utils.is_prompt_caching_valid_prompt(
        messages=anthropic_messages,
        tools=None,
        custom_llm_provider="anthropic",
        model="anthropic/claude-3-5-sonnet-20240620",
    )

@pytest.mark.parametrize(
    # assumed parametrization, inferred from how `messages` / `expected_model_id` are used below
    "messages, expected_model_id",
    [("anthropic_messages", True), ("normal_messages", False)],
)
@pytest.mark.asyncio
async def test_router_prompt_caching_model_stored(
    messages, expected_model_id, anthropic_messages
):
    """
    If a prompt-caching-supported model is called with a prompt-caching-valid prompt,
    then the model id should be stored in the router cache.
    """
    import asyncio

    from litellm.router import Router
    from litellm.router_utils.prompt_caching_cache import PromptCachingCache

    router = Router(
        model_list=[
            {
                "model_name": "claude-model",
                "litellm_params": {
                    "model": "anthropic/claude-3-5-sonnet-20240620",
                    "api_key": os.environ.get("ANTHROPIC_API_KEY"),
                },
                "model_info": {"id": "1234"},
            }
        ]
    )

    if messages == "anthropic_messages":
        _messages = anthropic_messages
    else:
        _messages = [{"role": "user", "content": "Hello"}]

    await router.acompletion(
        model="claude-model",
        messages=_messages,
        mock_response="The sky is blue.",
    )
    await asyncio.sleep(1)

    cache = PromptCachingCache(
        cache=router.cache,
    )

    cached_model_id = cache.get_model_id(messages=_messages, tools=None)

    if expected_model_id:
        assert cached_model_id["model_id"] == "1234"
    else:
        assert cached_model_id is None
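
# Routing test: with the `prompt_caching` pre-call check enabled, follow-up calls that
# share a cached prefix should be routed back to the deployment that created the cache.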
# @pytest.mark.skip(
#     reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
# )
@pytest.mark.asyncio
async def test_router_with_prompt_caching(anthropic_messages):
    """
    If a prompt-caching-supported model is called with a prompt-caching-valid prompt,
    then the 2nd call should be routed to the same deployment.
    """
    import asyncio

    from litellm.router import Router
    from litellm.router_utils.prompt_caching_cache import PromptCachingCache

    router = Router(
        model_list=[
            {
                "model_name": "claude-model",
                "litellm_params": {
                    "model": "anthropic/claude-3-5-sonnet-20240620",
                    "api_key": os.environ.get("ANTHROPIC_API_KEY"),
                    "mock_response": "The sky is blue.",
                },
            },
            {
                "model_name": "claude-model",
                "litellm_params": {
                    "model": "anthropic.claude-3-5-sonnet-20241022-v2:0",
                    "mock_response": "The sky is green.",
                },
            },
        ],
        optional_pre_call_checks=["prompt_caching"],
    )

    response = await router.acompletion(
        messages=anthropic_messages,
        model="claude-model",
        mock_response="The sky is blue.",
    )
    print("response=", response)
    initial_model_id = response._hidden_params["model_id"]

    await asyncio.sleep(1)

    cache = PromptCachingCache(
        cache=router.cache,
    )

    cached_model_id = cache.get_model_id(messages=anthropic_messages, tools=None)
    assert cached_model_id is not None

    prompt_caching_cache_key = PromptCachingCache.get_prompt_caching_cache_key(
        messages=anthropic_messages, tools=None
    )
    print(f"prompt_caching_cache_key: {prompt_caching_cache_key}")

    assert cached_model_id["model_id"] == initial_model_id

    new_messages = anthropic_messages + [
        {"role": "user", "content": "What is the weather in SF?"}
    ]

    for _ in range(20):
        response = await router.acompletion(
            messages=new_messages,
            model="claude-model",
            mock_response="The sky is blue.",
        )
        print("response=", response)

        assert response._hidden_params["model_id"] == initial_model_id