# What is this?
## This tests if the proxy fallbacks work as expected
import pytest
import asyncio
import aiohttp
from tests.large_text import text
import time
from typing import Optional

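# NOTE (assumption): these tests expect a LiteLLM proxy already running at
# http://0.0.0.0:4000 with master key "sk-1234", and a proxy config defining
# the model aliases used below (e.g. "gpt-3.5-turbo", "fake-openai-endpoint-4",
# "badly-configured-openai-endpoint").
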
async def generate_key(
    session,
    i,
    models: list,
    calling_key="sk-1234",
):
    url = "http://0.0.0.0:4000/key/generate"
    headers = {
        "Authorization": f"Bearer {calling_key}",
        "Content-Type": "application/json",
    }
    data = {
        "models": models,
    }
    print(f"data: {data}")

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(f"Response {i} (Status code: {status}):")
        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request {i} did not return a 200 status code: {status}")

        return await response.json()

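# NOTE: /key/generate returns a JSON body containing at least a "key" field
# (the generated virtual key); the tests below rely only on that field.
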
async def chat_completion(
    session,
    key: str,
    model: str,
    messages: list,
    return_headers: bool = False,
    extra_headers: Optional[dict] = None,
    **kwargs,
):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    if extra_headers is not None:
        headers.update(extra_headers)
    data = {"model": model, "messages": messages, **kwargs}

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            if return_headers:
                return None, response.headers
            else:
                raise Exception(f"Request did not return a 200 status code: {status}")

        if return_headers:
            return await response.json(), response.headers
        else:
            return await response.json()

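# Any extra kwargs (e.g. `fallbacks`, `num_retries`, `mock_timeout`) are
# forwarded verbatim in the request body, where the proxy reads them as
# per-request routing parameters.
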
@pytest.mark.asyncio
async def test_chat_completion():
    """
    Make a chat completion call with a prompt larger than the model's context
    window. Expect the proxy's context-window fallback to handle it.
    """
    async with aiohttp.ClientSession() as session:
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        await chat_completion(
            session=session, key="sk-1234", model=model, messages=messages
        )

@pytest.mark.parametrize("has_access", [True, False])
@pytest.mark.asyncio
async def test_chat_completion_client_fallbacks(has_access):
    """
    Make a chat completion call with client-side fallbacks. Expect the fallback
    to succeed only when the calling key has access to the fallback model.
    """
    async with aiohttp.ClientSession() as session:
        models = ["gpt-3.5-turbo"]

        if has_access:
            models.append("gpt-instruct")

        ## CREATE KEY WITH MODELS
        generated_key = await generate_key(session=session, i=0, models=models)
        calling_key = generated_key["key"]
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "user", "content": "Who was Alexander?"},
        ]

        ## CALL PROXY
        try:
            await chat_completion(
                session=session,
                key=calling_key,
                model=model,
                messages=messages,
                mock_testing_fallbacks=True,
                fallbacks=["gpt-instruct"],
            )
            if not has_access:
                pytest.fail(
                    "Expected this to fail, submitted fallback model that key did not have access to"
                )
        except Exception as e:
            if has_access:
                pytest.fail("Expected this to work: {}".format(str(e)))

@pytest.mark.asyncio
async def test_chat_completion_with_retries():
    """
    Trigger a mock rate-limit error. Expect the proxy to retry and to report
    the retry counts in the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-4"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            mock_testing_rate_limit_error=True,
            return_headers=True,
        )
        print(f"headers: {headers}")
        assert headers["x-litellm-attempted-retries"] == "1"
        assert headers["x-litellm-max-retries"] == "50"

@pytest.mark.asyncio
async def test_chat_completion_with_fallbacks():
    """
    Call a badly configured endpoint with a fallback model specified. Expect
    the request to succeed via the fallback, reported in the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "badly-configured-openai-endpoint"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            fallbacks=["fake-openai-endpoint-5"],
            return_headers=True,
        )
        print(f"headers: {headers}")
        assert headers["x-litellm-attempted-fallbacks"] == "1"

@pytest.mark.asyncio
async def test_chat_completion_with_timeout():
    """
    Call a model configured with `mock_timeout: true` and no request-level
    timeout. Expect the call to time out and the model-level timeout to be
    reported in the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-5"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        start_time = time.time()
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            num_retries=0,
            mock_timeout=True,
            return_headers=True,
        )
        end_time = time.time()
        print(f"headers: {headers}")
        assert (
            headers["x-litellm-timeout"] == "1.0"
        )  # assert model-specific timeout used

@pytest.mark.asyncio
async def test_chat_completion_with_timeout_from_request():
    """
    Call a model with `mock_timeout: true` and a per-request
    `x-litellm-timeout` header. Expect the request-level timeout to override
    the model-level one in the response headers.
    """
    async with aiohttp.ClientSession() as session:
        model = "fake-openai-endpoint-5"
        messages = [
            {"role": "system", "content": text},
            {"role": "user", "content": "Who was Alexander?"},
        ]
        extra_headers = {
            "x-litellm-timeout": "0.001",
        }
        start_time = time.time()
        response, headers = await chat_completion(
            session=session,
            key="sk-1234",
            model=model,
            messages=messages,
            num_retries=0,
            mock_timeout=True,
            extra_headers=extra_headers,
            return_headers=True,
        )
        end_time = time.time()
        print(f"headers: {headers}")
        assert (
            headers["x-litellm-timeout"] == "0.001"
        )  # assert request-specific timeout used

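# Taken together, the two timeout tests establish precedence: a per-request
# "x-litellm-timeout" header overrides the model-level timeout.
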
@pytest.mark.parametrize("has_access", [True, False])
@pytest.mark.asyncio
async def test_chat_completion_client_fallbacks_with_custom_message(has_access):
    """
    Make a chat completion call with a client-side fallback that carries a
    custom message. Expect it to succeed only when the calling key has access
    to the fallback model.
    """
    async with aiohttp.ClientSession() as session:
        models = ["gpt-3.5-turbo"]

        if has_access:
            models.append("gpt-instruct")

        ## CREATE KEY WITH MODELS
        generated_key = await generate_key(session=session, i=0, models=models)
        calling_key = generated_key["key"]
        model = "gpt-3.5-turbo"
        messages = [
            {"role": "user", "content": "Who was Alexander?"},
        ]

        ## CALL PROXY
        try:
            await chat_completion(
                session=session,
                key=calling_key,
                model=model,
                messages=messages,
                mock_testing_fallbacks=True,
                fallbacks=[
                    {
                        "model": "gpt-instruct",
                        "messages": [
                            {
                                "role": "assistant",
                                "content": "This is a custom message",
                            }
                        ],
                    }
                ],
            )
            if not has_access:
                pytest.fail(
                    "Expected this to fail, submitted fallback model that key did not have access to"
                )
        except Exception as e:
            if has_access:
                pytest.fail("Expected this to work: {}".format(str(e)))

from openai import AsyncOpenAI

async def make_request(client: AsyncOpenAI, model: str) -> bool:
    try:
        await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Who was Alexander?"}],
        )
        return True
    except Exception as e:
        print(f"Error with {model}: {str(e)}")
        return False


async def run_good_model_test(client: AsyncOpenAI, num_requests: int) -> bool:
    tasks = [make_request(client, "good-model") for _ in range(num_requests)]
    good_results = await asyncio.gather(*tasks)
    return all(good_results)

@pytest.mark.asyncio
async def test_chat_completion_bad_and_good_model():
    """
    Prod test - ensure even if bad model is down, good model is still working.
    """
    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
    num_requests = 100
    num_iterations = 3

    for iteration in range(num_iterations):
        print(f"\nIteration {iteration + 1}/{num_iterations}")
        start_time = time.time()

        # Fire and forget bad model requests
        for _ in range(num_requests):
            asyncio.create_task(make_request(client, "bad-model"))

        # Wait only for good model requests
        success = await run_good_model_test(client, num_requests)

        print(
            f"Iteration {iteration + 1}: {'✓' if success else '✗'} ({time.time() - start_time:.2f}s)"
        )
        assert success, "Not all good model requests succeeded"

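# A minimal sketch for running the prod test directly, outside pytest
# (assumption: the proxy described at the top of this file is running):
if __name__ == "__main__":
    asyncio.run(test_chat_completion_bad_and_good_model())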