import click
import subprocess, traceback, json
import os, sys
import random
import importlib.metadata  # explicit submodule import, needed for importlib.metadata.version()
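

# Start a local `ollama serve` process in the background (output suppressed),
# so that ollama/* models routed through the proxy have a server to talk to.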
def run_ollama_serve():
    try:
        command = ["ollama", "serve"]

        with open(os.devnull, "w") as devnull:
            process = subprocess.Popen(command, stdout=devnull, stderr=devnull)
    except Exception as e:
        print(
            f"""
            LiteLLM Warning: proxy started with `ollama` model\n`ollama serve` failed with Exception: {e}. \nEnsure you run `ollama serve`
            """
        )  # noqa
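

# Returns True if something is already listening on the given port on localhost.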
def is_port_in_use(port):
    import socket

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) == 0
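

# CLI entrypoint: depending on the flags, this either prints the version,
# runs a health/test/load-test request against a running proxy, or saves the
# worker config and starts the proxy server itself.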
def run_server(
    host="0.0.0.0",
    port=8000,
    api_base=None,
    api_version="2023-07-01-preview",
    model=None,
    alias=None,
    add_key=None,
    headers=None,
    save=False,
    debug=False,
    detailed_debug=False,
    temperature=0.0,
    max_tokens=1000,
    request_timeout=10,
    drop_params=True,
    add_function_to_prompt=True,
    config=None,
    max_budget=100,
    telemetry=False,
    test=False,
    local=False,
    num_workers=1,
    test_async=False,
    num_requests=1,
    use_queue=False,
    health=False,
    version=False,
):
    global feature_telemetry
    args = locals()
    if local:
        from proxy_server import app, save_worker_config, usage_telemetry
    else:
        try:
            from .proxy_server import app, save_worker_config, usage_telemetry
        except ImportError as e:
            if "litellm[proxy]" in str(e):
                # user is missing a proxy dependency, ask them to pip install litellm[proxy]
                raise e
            else:
                # this is just a local/relative import error, user git cloned litellm
                from proxy_server import app, save_worker_config, usage_telemetry
    feature_telemetry = usage_telemetry
    if version == True:
        pkg_version = importlib.metadata.version("litellm")
        click.echo(f"\nLiteLLM: Current Version = {pkg_version}\n")
        return
    if model and "ollama" in model and api_base is None:
        run_ollama_serve()
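    # --test_async: load-test the /queue endpoint by submitting `num_requests`
    # concurrent jobs and polling each one until it finishes.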
    if test_async is True:
        import requests, concurrent.futures, time

        api_base = f"http://{host}:{port}"

        def _make_openai_completion():
            data = {
                "model": "gpt-3.5-turbo",
                "messages": [
                    {"role": "user", "content": "Write a short poem about the moon"}
                ],
            }

            response = requests.post(f"{api_base}/queue/request", json=data)
            response = response.json()

            llm_response = None
            while True:
                try:
                    url = response["url"]
                    polling_url = f"{api_base}{url}"
                    polling_response = requests.get(polling_url)
                    polling_response = polling_response.json()
                    print("\n RESPONSE FROM POLLING JOB", polling_response)
                    status = polling_response["status"]
                    if status == "finished":
                        llm_response = polling_response["result"]
                        break
                    print(
                        f"POLLING JOB {polling_url}\nSTATUS: {status}, \n Response {polling_response}"
                    )  # noqa
                    time.sleep(0.5)
                except Exception as e:
                    print("got exception in polling", e)
                    break
            return llm_response

        # Number of concurrent calls (you can adjust this)
        concurrent_calls = num_requests

        # List to store the futures of concurrent calls
        futures = []
        start_time = time.time()
        # Make concurrent calls
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=concurrent_calls
        ) as executor:
            for _ in range(concurrent_calls):
                futures.append(executor.submit(_make_openai_completion))

        # Wait for all futures to complete
        concurrent.futures.wait(futures)

        # Summarize the results
        successful_calls = 0
        failed_calls = 0
        for future in futures:
            if future.done():
                if future.result() is not None:
                    successful_calls += 1
                else:
                    failed_calls += 1
        end_time = time.time()
        print(f"Elapsed Time: {end_time - start_time}")
        print("Load test Summary:")
        print(f"Total Requests: {concurrent_calls}")
        print(f"Successful Calls: {successful_calls}")
        print(f"Failed Calls: {failed_calls}")
        return
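    # --health: query the running proxy's /health endpoint and pretty-print the result.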
    if health != False:
        import requests

        print("\nLiteLLM: Health Testing models in config")
        response = requests.get(url=f"http://{host}:{port}/health")
        print(json.dumps(response.json(), indent=4))
        return
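    # --test: send a non-streaming, a streaming, and a text-completion request
    # through the proxy using the OpenAI client, to verify it is serving traffic.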
    if test != False:
        request_model = model or "gpt-3.5-turbo"
        click.echo(
            f"\nLiteLLM: Making a test ChatCompletions request to your proxy. Model={request_model}"
        )
        import openai

        if test == True:  # flag value set
            api_base = f"http://{host}:{port}"
        else:
            api_base = test
        client = openai.OpenAI(api_key="My API Key", base_url=api_base)
        response = client.chat.completions.create(
            model=request_model,
            messages=[
                {
                    "role": "user",
                    "content": "this is a test request, write a short poem",
                }
            ],
            max_tokens=256,
        )
        click.echo(f"\nLiteLLM: response from proxy {response}")

        print(
            f"\n LiteLLM: Making a test ChatCompletions + streaming request to proxy. Model={request_model}"
        )
        response = client.chat.completions.create(
            model=request_model,
            messages=[
                {
                    "role": "user",
                    "content": "this is a test request, write a short poem",
                }
            ],
            stream=True,
        )
        for chunk in response:
            click.echo(f"LiteLLM: streaming response from proxy {chunk}")

        print("\n making completion request to proxy")
        response = client.completions.create(
            model=request_model, prompt="this is a test request, write a short poem"
        )
        print(response)
        return
    else:
        if headers:
            headers = json.loads(headers)
        save_worker_config(
            model=model,
            alias=alias,
            api_base=api_base,
            api_version=api_version,
            debug=debug,
            detailed_debug=detailed_debug,
            temperature=temperature,
            max_tokens=max_tokens,
            request_timeout=request_timeout,
            max_budget=max_budget,
            telemetry=telemetry,
            drop_params=drop_params,
            add_function_to_prompt=add_function_to_prompt,
            headers=headers,
            save=save,
            config=config,
            use_queue=use_queue,
        )
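
        # Server dependencies: uvicorn is always required; gunicorn is only
        # imported on non-Windows platforms.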
        try:
            import uvicorn

            if os.name == "nt":
                pass
            else:
                import gunicorn.app.base
        except ImportError:
            raise ImportError(
                "uvicorn and gunicorn need to be installed. Run - `pip install 'litellm[proxy]'`"
            )
        if config is not None:
            """
            Allow user to pass in db url via config
            read from there and save it to os.environ["DATABASE_URL"]
            """
            try:
                import yaml
            except ImportError:
                raise ImportError(
                    "yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
                )

            if os.path.exists(config):
                with open(config, "r") as config_file:
                    config = yaml.safe_load(config_file)
                general_settings = config.get("general_settings", {})
                database_url = general_settings.get("database_url", None)
                if database_url and database_url.startswith("os.environ/"):
                    original_dir = os.getcwd()
                    # set the working directory to where this script is
                    sys.path.insert(
                        0, os.path.abspath("../..")
                    )  # Adds the parent directory to the system path - for litellm local dev
                    import litellm

                    database_url = litellm.get_secret(database_url)
                    os.chdir(original_dir)
                if database_url is not None and isinstance(database_url, str):
                    os.environ["DATABASE_URL"] = database_url
| if os.getenv("DATABASE_URL", None) is not None: | |
| try: | |
| subprocess.run(["prisma"], capture_output=True) | |
| is_prisma_runnable = True | |
| except FileNotFoundError: | |
| is_prisma_runnable = False | |
| if is_prisma_runnable: | |
| # run prisma db push, before starting server | |
| # Save the current working directory | |
| original_dir = os.getcwd() | |
| # set the working directory to where this script is | |
| abspath = os.path.abspath(__file__) | |
| dname = os.path.dirname(abspath) | |
| os.chdir(dname) | |
| try: | |
| subprocess.run( | |
| ["prisma", "db", "push", "--accept-data-loss"] | |
| ) # this looks like a weird edge case when prisma just wont start on render. we need to have the --accept-data-loss | |
| finally: | |
| os.chdir(original_dir) | |
| else: | |
| print( | |
| f"Unable to connect to DB. DATABASE_URL found in environment, but prisma package not found." | |
| ) | |
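
        # If the default port is already taken, fall back to a random high port,
        # then start the FastAPI app with uvicorn.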
        if port == 8000 and is_port_in_use(port):
            port = random.randint(1024, 49152)

        from litellm.proxy.proxy_server import app

        uvicorn.run(app, host=host, port=port)  # run uvicorn
        # if os.name == "nt":
        #     pass
        # else:
        #     import gunicorn.app.base
        #
        #     # Gunicorn Application Class
        #     class StandaloneApplication(gunicorn.app.base.BaseApplication):
        #         def __init__(self, app, options=None):
        #             self.options = options or {}  # gunicorn options
        #             self.application = app  # FastAPI app
        #             super().__init__()
        #
        #             _endpoint_str = (
        #                 f"curl --location 'http://0.0.0.0:{port}/chat/completions' \\"
        #             )
        #             curl_command = (
        #                 _endpoint_str
        #                 + """
        #             --header 'Content-Type: application/json' \\
        #             --data ' {
        #             "model": "gpt-3.5-turbo",
        #             "messages": [
        #                 {
        #                 "role": "user",
        #                 "content": "what llm are you"
        #                 }
        #             ]
        #             }'
        #             \n
        #             """
        #             )
        #             print()  # noqa
        #             print(  # noqa
        #                 f'\033[1;34mLiteLLM: Test your local proxy with: "litellm --test" This runs an openai.ChatCompletion request to your proxy [In a new terminal tab]\033[0m\n'
        #             )
        #             print(  # noqa
        #                 f"\033[1;34mLiteLLM: Curl Command Test for your local proxy\n {curl_command} \033[0m\n"
        #             )
        #             print(
        #                 "\033[1;34mDocs: https://docs.litellm.ai/docs/simple_proxy\033[0m\n"
        #             )  # noqa
        #             print(  # noqa
        #                 f"\033[1;34mSee all Router/Swagger docs on http://0.0.0.0:{port} \033[0m\n"
        #             )  # noqa
        #
        #         def load_config(self):
        #             # note: This Loads the gunicorn config - has nothing to do with LiteLLM Proxy config
        #             config = {
        #                 key: value
        #                 for key, value in self.options.items()
        #                 if key in self.cfg.settings and value is not None
        #             }
        #             for key, value in config.items():
        #                 self.cfg.set(key.lower(), value)
        #
        #         def load(self):
        #             # gunicorn app function
        #             return self.application
        #
        #     gunicorn_options = {
        #         "bind": f"{host}:{port}",
        #         "workers": num_workers,  # default is 1
        #         "worker_class": "uvicorn.workers.UvicornWorker",
        #         "preload": True,  # Add the preload flag,
        #         "accesslog": "-",  # Log to stdout
        #         "access_log_format": '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s',
        #     }
        #
        #     StandaloneApplication(
        #         app=app, options=gunicorn_options
        #     ).run()  # Run gunicorn


if __name__ == "__main__":
    run_server()