from fastapi import FastAPI
from llama_cpp import Llama
# Create an instance of the FastAPI class
app = FastAPI()
# Define a route for the /llm endpoint
@app.get("/llm")
async def read_root():
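    # Note: the GGUF model is fetched (cached by huggingface_hub after the first
    # download) and loaded on every request; for anything beyond a demo, create
    # the Llama instance once at module level and reuse it.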
    llm = Llama.from_pretrained(
        repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
        filename="*q8_0.gguf",
        verbose=False,
    )
    output = llm(
        "Q: Name the planets in the solar system? A: ",  # Prompt
        max_tokens=32,      # Generate up to 32 tokens; set to None to generate up to the end of the context window
        stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
        echo=True,          # Echo the prompt back in the output
    )
    return {"message": output}