litellmlope

Paused

App Files Files Community

litellmlope / litellm /tests /test_azure_perf.py

ka1kuk

Upload 235 files

7db0ae4 verified almost 2 years ago

raw

history blame

4.03 kB

	#### What this tests ####
	# This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk.
	import sys, os, time, inspect, asyncio, traceback
	from datetime import datetime
	import pytest

	sys.path.insert(0, os.path.abspath("../.."))
	import openai, litellm, uuid
	from openai import AsyncAzureOpenAI

	# Baseline side of the comparison: the Azure OpenAI async SDK client,
	# configured from the AZURE_* environment variables.
	client = AsyncAzureOpenAI(
	    api_key=os.getenv("AZURE_API_KEY"),
	    azure_endpoint=os.getenv("AZURE_API_BASE"),  # type: ignore
	    api_version=os.getenv("AZURE_API_VERSION"),
	)

	# Side under test: a litellm Router with a single "azure-test" entry that
	# reads the same AZURE_* environment variables as the SDK client above.
	_azure_test_params = {
	    "model": "azure/chatgpt-v-2",
	    "api_key": os.getenv("AZURE_API_KEY"),
	    "api_base": os.getenv("AZURE_API_BASE"),
	    "api_version": os.getenv("AZURE_API_VERSION"),
	}

	model_list = [
	    {"model_name": "azure-test", "litellm_params": _azure_test_params},
	]

	router = litellm.Router(model_list=model_list)


	async def _openai_completion():
	    """
	    Stream one chat completion through the Azure OpenAI SDK client.

	    Returns the elapsed seconds between issuing the request and the first
	    streamed chunk that carries non-null content. Returns None when no such
	    chunk arrives or when anything in the call raises (the error is printed).
	    """
	    try:
	        started_at = time.time()
	        stream = await client.chat.completions.create(
	            model="chatgpt-v-2",
	            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
	            stream=True,
	        )
	        ttft = None
	        first_seen_at = None
	        first_chunk = None
	        async for piece in stream:
	            # Only the first content-bearing chunk matters; the rest of the
	            # stream is still drained so the end timestamp is meaningful.
	            if ttft is not None:
	                continue
	            if len(piece.choices) == 0:
	                continue
	            if piece.choices[0].delta.content is None:
	                continue
	            first_seen_at = time.time()
	            ttft = first_seen_at - started_at
	            first_chunk = piece
	        finished_at = time.time()
	        print(
	            "OpenAI Call: ",
	            first_chunk,
	            started_at,
	            first_seen_at,
	            ttft,
	            finished_at,
	        )
	        return ttft
	    except Exception as err:
	        # Best-effort: the caller filters out None results.
	        print(err)
	        return None


	async def _router_completion():
	    """
	    Stream one chat completion through the litellm router.

	    Returns the elapsed seconds between issuing the request and the first
	    streamed chunk that carries non-null content. Returns None when no such
	    chunk arrives or when anything in the call raises (the error is printed).
	    """
	    try:
	        start_time = time.time()
	        response = await router.acompletion(
	            model="azure-test",
	            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
	            stream=True,
	        )
	        time_to_first_token = None
	        first_token_ts = None
	        init_chunk = None
	        async for chunk in response:
	            if (
	                time_to_first_token is None
	                and len(chunk.choices) > 0
	                and chunk.choices[0].delta.content is not None
	            ):
	                first_token_ts = time.time()
	                time_to_first_token = first_token_ts - start_time
	                init_chunk = chunk
	        end_time = time.time()
	        # If the stream never produced a content token, first_token_ts is
	        # still None. Subtracting from it would raise TypeError, and the
	        # generic handler below would print that error instead of this
	        # summary, hiding what actually happened.
	        time_after_first_token = (
	            end_time - first_token_ts if first_token_ts is not None else None
	        )
	        print(
	            "Router Call: ",
	            init_chunk,
	            start_time,
	            first_token_ts,
	            time_to_first_token,
	            time_after_first_token,
	        )
	        return time_to_first_token
	    except Exception as e:
	        # Best-effort: the caller filters out None results.
	        print(e)
	        return None


	async def test_azure_completion_streaming():
	    """
	    Test azure streaming call - measure on time to first (non-null) token.

	    Runs n concurrent streaming calls through the Azure OpenAI SDK client,
	    then n through the litellm router. Each side's time-to-first-token is
	    averaged over the calls that succeeded, and the router average must be
	    less than 0.5 seconds above the SDK average. The test fails if either
	    side has no successful call.
	    """
	    # NOTE(review): nothing in this block marks the coroutine for an asyncio
	    # pytest plugin - confirm the project's pytest configuration collects
	    # bare `async def` tests, otherwise this test never executes.
	    n = 3  # Number of concurrent tasks
	    # NOTE(review): the header comment of this file says 50ms, but the
	    # threshold asserted here is 0.5 seconds - confirm which is intended.
	    max_router_overhead = 0.5

	    ## OPENAI AVG. TIME
	    tasks = [_openai_completion() for _ in range(n)]
	    chat_completions = await asyncio.gather(*tasks)
	    openai_times = [c for c in chat_completions if c is not None]
	    assert openai_times, "every Azure OpenAI SDK call failed; no baseline to compare"
	    # Divide by the number of successful calls, not by n: failed calls return
	    # None and must not pull the average toward zero.
	    avg_openai_time = sum(openai_times) / len(openai_times)

	    ## ROUTER AVG. TIME
	    tasks = [_router_completion() for _ in range(n)]
	    chat_completions = await asyncio.gather(*tasks)
	    router_times = [c for c in chat_completions if c is not None]
	    # Without this check a router that fails every call would average 0 and
	    # pass the comparison below vacuously.
	    assert router_times, "every router call failed; nothing to measure"
	    avg_router_time = sum(router_times) / len(router_times)

	    ## COMPARE
	    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
	    assert avg_router_time < avg_openai_time + max_router_overhead


	# asyncio.run(test_azure_completion_streaming())